Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the dockerfile base image to cuda-dl-base #1248

Merged
merged 18 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 4 additions & 73 deletions .github/container/Dockerfile.base
Steboss marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04
ARG GIT_USER_NAME="JAX Toolbox"
ARG GIT_USER_EMAIL=jax@nvidia.com
ARG CLANG_VERSION=18
ARG JAX_TOOLBOX_REF

###############################################################################
## Obtain GCP's NCCL TCPx plugin
###############################################################################

FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64

# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
FROM ubuntu AS tcpx-installer-arm64
RUN <<"OUTEREOF" bash -ex
mkdir -p /scripts /var/lib/tcpx/lib64
echo '#!/bin/bash' > /scripts/container_entry.sh
chmod +x /scripts/container_entry.sh
OUTEREOF

FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
RUN /scripts/container_entry.sh install

###############################################################################
## Build base image
###############################################################################
Expand Down Expand Up @@ -153,72 +136,20 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*

###############################################################################
## Install TCPx
###############################################################################

ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}

###############################################################################
## Install the latest versions of Nsight Systems and Nsight Compute
###############################################################################

ADD install-nsight.sh /usr/local/bin
olupton marked this conversation as resolved.
Show resolved Hide resolved
RUN install-nsight.sh

###############################################################################
## Install cuDNN
## Symlink for cuDNN
Steboss marked this conversation as resolved.
Show resolved Hide resolved
###############################################################################

ADD install-cudnn.sh /usr/local/bin
RUN install-cudnn.sh

###############################################################################
## Install NCCL
## Symlink for NCCL
###############################################################################

# same for this
Steboss marked this conversation as resolved.
Show resolved Hide resolved
Steboss marked this conversation as resolved.
Show resolved Hide resolved
ADD install-nccl.sh /usr/local/bin
RUN install-nccl.sh

###############################################################################
## RoCE and InfiniBand support
###############################################################################

ADD install-ofed.sh /usr/local/bin
RUN install-ofed.sh
olupton marked this conversation as resolved.
Show resolved Hide resolved

##############################################################################
## Amazon EFA support (need to run it inside container separately)
##############################################################################

ADD --chmod=777 \
install-efa.sh \
olupton marked this conversation as resolved.
Show resolved Hide resolved
test-aws-efa.sh \
olupton marked this conversation as resolved.
Show resolved Hide resolved
/usr/local/bin/
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
ENV PATH=/opt/amazon/efa/bin:${PATH}

##############################################################################
## NCCL sanity check utility
##############################################################################

ADD install-nccl-sanity-check.sh /usr/local/bin
ADD nccl-sanity-check.cu /opt
RUN install-nccl-sanity-check.sh
olupton marked this conversation as resolved.
Show resolved Hide resolved
ADD jax-nccl-test parallel-launch /usr/local/bin/
olupton marked this conversation as resolved.
Show resolved Hide resolved

###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################

COPY check-shm.sh /opt/nvidia/entrypoint.d/
Steboss marked this conversation as resolved.
Show resolved Hide resolved
olupton marked this conversation as resolved.
Show resolved Hide resolved

###############################################################################
## Add the GCP - TCPX check to the entrypoint.
###############################################################################

# TODO(chaserileyroberts): Reenable once fully tested on GCP.
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/
Steboss marked this conversation as resolved.
Show resolved Hide resolved

###############################################################################
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems
Expand Down
72 changes: 31 additions & 41 deletions .github/container/install-cudnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,6 @@

set -ex

# Install the cuDNN runtime and development packages from apt, choosing the
# first listed version that matches the CUDA toolkit found via `nvcc`.
# Exits with status 1 when the CUDA version cannot be detected, when
# CUDNN_MAJOR_VERSION is not handled below, or when no matching package
# version is found.

# Run apt without interactive prompts
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

CUDNN_MAJOR_VERSION=9

apt-get update

# Extract major CUDA version from `nvcc --version` output line
# Input: "Cuda compilation tools, release X.Y, VX.Y.Z"
# Output: X
cuda_major_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p' | cut -d. -f1)
# The pipeline status is that of `cut`, so a missing or unparsable nvcc only
# shows up as an empty result; fail early with a precise message.
if [[ -z "${cuda_major_version}" ]]; then
  echo "Could not determine the CUDA major version from nvcc"
  exit 1
fi

# Find the latest cuDNN version compatible with the existing CUDA by matching
# ${cuda_major_version} in the package name or version string.
# cuDNN releases usually lag behind CUDA releases; a cuDNN build is treated as
# compatible when it targets the same CUDA major version.
# For example, CUDA 12.3 + cuDNN 8.9.6 (libcudnn8 version: 8.9.6.50-1+cuda12.2)
# is considered to be compatible.
if [[ ${CUDNN_MAJOR_VERSION} -le 8 ]]; then
  # cuDNN <= 8: the CUDA version is encoded in the package *version* string
  libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}
  libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev
  version_pattern="s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p"
elif [[ ${CUDNN_MAJOR_VERSION} -eq 9 ]]; then
  # cuDNN 9: the CUDA major version is encoded in the package *name*
  libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}-cuda-${cuda_major_version}
  libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev-cuda-${cuda_major_version}
  version_pattern="s/^Version: \(${CUDNN_MAJOR_VERSION}\.[0-9.-]*\)$/\1/p"
else
  # Without this branch the package names would stay unset and the failure
  # would only surface later with a misleading message.
  echo "Unsupported cuDNN major version: ${CUDNN_MAJOR_VERSION}"
  exit 1
fi
libcudnn_version=$(apt-cache show "${libcudnn_name}" | sed -n "${version_pattern}" | head -n 1)
libcudnn_dev_version=$(apt-cache show "${libcudnn_dev_name}" | sed -n "${version_pattern}" | head -n 1)
if [[ -z "${libcudnn_version}" || -z "${libcudnn_dev_version}" ]]; then
  echo "Could not find compatible cuDNN version for CUDA ${cuda_major_version}"
  exit 1
fi

apt-get install -y \
  "${libcudnn_name}=${libcudnn_version}" \
  "${libcudnn_dev_name}=${libcudnn_dev_version}"
# Drop apt caches and package lists after the install
apt-get clean
rm -rf /var/lib/apt/lists/*

# Create a prefix with include/ and lib/ directories containing symlinks to the cuDNN
# version that was just installed; this is useful to pass to XLA to avoid it fetching
# its own copy of cuDNN.
Expand All @@ -50,8 +10,15 @@ if [[ -d "${prefix}" ]]; then
echo "Skipping link farm creation"
exit 1
fi

arch=$(uname -m)-linux-gnu
for cudnn_file in $(dpkg -L ${libcudnn_name} ${libcudnn_dev_name} | sort -u); do
# Gather the names of every cuDNN package that dpkg lists with an "ii" status
# prefix; stop with an error when there is none, since the loop below would
# have nothing to link.
libcudnn_pkgs=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
[[ -n "${libcudnn_pkgs}" ]] || {
  echo "No libcudnn packages installed."
  exit 1
}

for cudnn_file in $(dpkg -L ${libcudnn_pkgs} | sort -u); do
# Real files and symlinks are linked into $prefix
if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
# Replace /usr with $prefix
Expand All @@ -70,3 +37,26 @@ for cudnn_file in $(dpkg -L ${libcudnn_name} ${libcudnn_dev_name} | sort -u); do
echo "Skipping ${cudnn_file}"
fi
done

# Replicate the original header symlinks too, so we'll also have e.g.
# /opt/nvidia/cudnn/include/cudnn.h
# Every top-level /usr/include/cudnn*.h symlink whose target lives in the
# arch-specific directory (${arch}/) gets a counterpart in ${prefix}/include
# that points at the matching file inside ${prefix}/include.
find /usr/include -maxdepth 1 -name "cudnn*.h" -type l | while IFS= read -r symlink; do
  symlink_name=$(basename "${symlink}")
  symlink_target=$(readlink "${symlink}")
  # Only relative targets of the form ${arch}/<file> are mirrored
  if [[ "${symlink_target}" == "${arch}/"* ]]; then
    # Adjust the symlink target to point within our symlink directory
    adjusted_target="${prefix}/include/${symlink_target#"${arch}/"}"
    # Destination symlink within the symlink directory
    link_name="${prefix}/include/${symlink_name}"
    mkdir -p "$(dirname "${link_name}")"
    # `-e` follows symlinks and is false for a dangling link, so test `-L` as
    # well; otherwise `ln -s` would fail on an existing dangling link.
    if [[ -e "${link_name}" || -L "${link_name}" ]]; then
      echo "Symlink ${link_name} already exists. Skipping."
    else
      ln -s "${adjusted_target}" "${link_name}"
    fi
  else
    echo "Skipping symlink ${symlink} with target ${symlink_target}"
  fi
done
Steboss marked this conversation as resolved.
Show resolved Hide resolved
41 changes: 9 additions & 32 deletions .github/container/install-nccl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,6 @@

set -ex -o pipefail

# Run apt without interactive prompts
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

# Install NCCL from apt only when libnccl2/libnccl-dev are not both present;
# an existing installation is left untouched.
if ! dpkg -s libnccl2 libnccl-dev &> /dev/null; then
  apt-get update

  # Pull the "X.Y" out of the `nvcc --version` line
  # "Cuda compilation tools, release X.Y, VX.Y.Z"
  cuda_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p')

  # Select the first listed package version whose version string ends in
  # +cuda${cuda_version}.
  # NOTE(review): the libnccl2 version is also looked up via libnccl-dev,
  # presumably because both packages share version strings; confirm.
  version_filter="s/^Version: \(.*+cuda${cuda_version}\)$/\1/p"
  libnccl2_version=$(apt-cache show libnccl-dev | sed -n "${version_filter}" | head -n 1)
  libnccl_dev_version=$(apt-cache show libnccl-dev | sed -n "${version_filter}" | head -n 1)
  if [[ -z "${libnccl2_version}" || -z "${libnccl_dev_version}" ]]; then
    echo "Could not find compatible NCCL version for CUDA ${cuda_version}"
    exit 1
  fi

  apt-get install -y \
    libnccl2=${libnccl2_version} \
    libnccl-dev=${libnccl_dev_version}

  # Drop apt caches and package lists after the install
  apt-get clean
  rm -rf /var/lib/apt/lists/*
else
  echo "NCCL is already installed. Skipping installation."
fi

# Create a prefix with include/ and lib/ directories containing symlinks to the NCCL
# version installed at the system level; this is useful to pass to XLA to avoid it
# fetching its own copy.
Expand All @@ -42,7 +11,15 @@ if [[ -d "${prefix}" ]]; then
exit 1
fi
arch=$(uname -m)-linux-gnu
for nccl_file in $(dpkg -L libnccl2 libnccl-dev | sort -u); do
olupton marked this conversation as resolved.
Show resolved Hide resolved
# Collect the names of all NCCL packages that dpkg lists with an "ii" status
# prefix, for use by the symlink loop below.
# `dpkg -l` exits non-zero when no package matches the pattern. With
# `set -e -o pipefail` active, that would abort the script on the assignment
# itself, before the explicit error message below could be printed. The
# `|| true` keeps the captured output and defers the decision to the check.
nccl_packages=$(dpkg -l 'libnccl*' | awk '/^ii/ {print $2}') || true

if [[ -z "${nccl_packages}" ]]; then
  echo "No NCCL packages installed."
  exit 1
fi


for nccl_file in $(dpkg -L ${nccl_packages} | sort -u); do
# Real files and symlinks are linked into $prefix
if [[ -f "${nccl_file}" || -h "${nccl_file}" ]]; then
# Replace /usr with $prefix and remove arch-specific lib directories
Expand Down
Loading