Skip to content

Commit

Permalink
vm for translations project (#106)
Browse files Browse the repository at this point in the history
* wip

* fix image name

see https://cloud.google.com/deep-learning-vm/docs/images#listing-versions on how to get a list

* add worker-runner

* use latest tc components

* add note

* set project_id and zone temporarily

* fix copy-pasta

* create ubuntu-minimal scripts dir

* add steps to worker-runner-gw-systemd

* cleanup, comments

* set project to translations-sandbox

* gcp packer template: add ability to specify disk size

* set disk to 60gb

* pick better image family

* add poetry config

* wip, ugly

* use non-minimal, cleanup of misc

* disable kdump

* cuda: install via network

* fix url to g-w binary

* remove notes

* wip, headers are present, but config issues

* checkpoint: symlink issues solved?

* remove unused script dir

* indents

* revert code to disable cloud-init

* remove disable_cloud_init

* remove comment

* pyproject: add description

* remove poetry change from this PR, will make new

* comment cleanup

* more cleanup

* wr config cleanup

* wip: adding cudnn

* add singularity pre-reqs

* wip fixes

* apt-get install needs -y

* rework singularity installation

* cudnn cleanup

* install cuda keyring from /tmp

* wr-gw-systemd: add task to create tc job dirs

* header name change fixes

* tc dir creation fixes

* g-w runs as ubuntu, install additional packages

* renames and fixes

* revert systemd unit file location change

* comment tweak

* rename, install zstandard via pip

* add libhunspell packages

* Update scripts/ubuntu-cuda/README.md

Co-authored-by: Yarik <lotas@users.noreply.github.com>

---------

Co-authored-by: Pete Moore <pmoore@mozilla.com>
Co-authored-by: Yarik <lotas@users.noreply.github.com>
  • Loading branch information
3 people authored May 12, 2023
1 parent 600a40d commit 118673c
Show file tree
Hide file tree
Showing 23 changed files with 443 additions and 3 deletions.
18 changes: 18 additions & 0 deletions builders/generic_translations_gcp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Builder definition for the translations project image on Google Cloud.
# NOTE(review): presumably consumed by monopacker; the var files and script
# directories below appear to be applied in the order listed - confirm.
template: googlecompute
platform: linux
# machine_type: # TODO: use a larger instance for faster builds (singularity)

# Variable files for this builder.
builder_var_files:
- taskcluster_version_latest # new file as non-latest still needed for d-w
- default_linux
- translations_gcp # TODO: merge this and following?
- googlecompute_translations

# Provisioning script directories; each name matches a scripts/<name>/ directory.
script_directories:
- ubuntu-jammy
- ubuntu-worker-requirements
- ubuntu-cuda
- generic-worker-linux
- worker-runner-linux
- worker-runner-gw-systemd # TODO: merge with 'generic-worker-linux'?

2 changes: 1 addition & 1 deletion scripts/generic-worker-linux/01-install-generic-worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ TASKCLUSTER_PROXY_VERSION='v5.1.0'
# install generic-worker into /home/ubuntu/generic-worker
mkdir -p /home/ubuntu/generic-worker
cd /home/ubuntu/generic-worker
retry curl -L "https://github.com/taskcluster/taskcluster/releases/download/${GENERIC_WORKER_VERSION}/generic-worker-simple-linux-amd64" > generic-worker
retry curl -L "https://github.com/taskcluster/taskcluster/releases/download/v${GENERIC_WORKER_VERSION}/generic-worker-simple-linux-amd64" > generic-worker
retry curl -L "https://github.com/taskcluster/livelog/releases/download/${LIVELOG_VERSION}/livelog-linux-amd64" > livelog
retry curl -L "https://github.com/taskcluster/taskcluster-proxy/releases/download/${TASKCLUSTER_PROXY_VERSION}/taskcluster-proxy-linux-amd64" > taskcluster-proxy
chmod a+x generic-worker taskcluster-proxy livelog
Expand Down
52 changes: 52 additions & 0 deletions scripts/ubuntu-cuda/10-install-cuda-libs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash

# Reinstalls the GCP kernel headers for the running kernel (so dkms works)
# and then installs CUDA from NVIDIA's apt repository.

set -exv

# init helpers
helpers_dir=${MONOPACKER_HELPERS_DIR:-"/etc/monopacker/scripts"}
for h in "${helpers_dir}"/*.sh; do
    . "$h"
done

#
# prepare
#

# problem: broken symlinks in the kernel headers directory, e.g.
#   ls -la /usr/src/linux-headers-5.15.0-1030-gcp
# fix: reinstall the headers packages, e.g.
#   sudo apt reinstall linux-gcp-headers-5.15.0-1030
#
# ensure kernel headers are present so dkms works
#
# The versioned headers package has been named in two ways:
#   earlier: linux-gcp-headers-<release without "-gcp">   (not needed now?)
#   now:     linux-gcp-<major.minor>-headers-<release without "-gcp">
#            e.g. linux-gcp-5.19-headers-5.19.0-1021
#            when `uname -r` is 5.19.0-1021-gcp
kernel_release=$(uname -r)
# strip the first "-gcp" occurrence, same as `sed -r s/-gcp//`
release_without_gcp=${kernel_release/-gcp/}
# first two dot-separated fields, e.g. 5.19
short_version=$(printf '%s\n' "${kernel_release}" | cut -d "." -f1,2)
pkg_name="linux-gcp-${short_version}-headers-${release_without_gcp}"

retry sudo apt-get update
sudo apt-get -y reinstall \
    linux-headers-gcp \
    "linux-headers-${kernel_release}" \
    "${pkg_name}"

# TODO: install future kernel headers (latest in `dpkg --list 'linux-image*'` output)
#   in addition to current (uname -r)?
#   - not needed as we're compiling on what's running?


#
# install driver
#

# install cuda from network (runfile demands input and fails)
cuda_keyring_deb="cuda-keyring_1.0-1_all.deb"
cuda_repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64"

cd /tmp
# -O makes a retried download overwrite the file instead of creating "<name>.1"
retry wget -O "${cuda_keyring_deb}" "${cuda_repo_url}/${cuda_keyring_deb}"
sudo dpkg -i "${cuda_keyring_deb}"
retry sudo apt-get update
sudo apt-get -y install cuda
30 changes: 30 additions & 0 deletions scripts/ubuntu-cuda/20-install-libcudnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Adds NVIDIA's CUDA apt repository for the running Ubuntu release and installs
# pinned versions of the cuDNN runtime and development packages.

set -exv

# init helpers
helpers_dir=${MONOPACKER_HELPERS_DIR:-"/etc/monopacker/scripts"}
for h in "${helpers_dir}"/*.sh; do
    . "$h"
done

# see https://developer.nvidia.com/cudnn
# steps from https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html
# alternate resource: https://gist.github.com/valgur/fcd72fcdf5db81a826f8ff9802621d75

# official steps

UBUNTU_RELEASE=$(lsb_release -rs)        # e.g. 22.04
DISTRO=ubuntu${UBUNTU_RELEASE//./}       # e.g. ubuntu2204 (dots removed)
cuda_version="cuda12.0"
cudnn_version="8.8.1.*"
cuda_repo_url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}/x86_64"

# -O makes a retried download overwrite the file instead of creating "<name>.1"
retry wget -O "cuda-${DISTRO}.pin" "${cuda_repo_url}/cuda-${DISTRO}.pin"

sudo mv "cuda-${DISTRO}.pin" /etc/apt/preferences.d/cuda-repository-pin-600
retry sudo apt-key adv --fetch-keys "${cuda_repo_url}/3bf863cc.pub"
# -y: the image build is unattended, never wait for confirmation
sudo add-apt-repository -y "deb ${cuda_repo_url}/ /"
retry sudo apt-get update

# The version specs are quoted so the '*' reaches apt-get instead of being
# expanded by the shell; -y keeps the install from prompting.
retry sudo apt-get install -y "libcudnn8=${cudnn_version}-1+${cuda_version}"
retry sudo apt-get install -y "libcudnn8-dev=${cudnn_version}-1+${cuda_version}"
31 changes: 31 additions & 0 deletions scripts/ubuntu-cuda/30-install-singularity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Installs SingularityCE from the official .deb release, after installing the
# packages it needs.

set -exv

# init helpers
helpers_dir=${MONOPACKER_HELPERS_DIR:-"/etc/monopacker/scripts"}
for h in "${helpers_dir}"/*.sh; do
    . "$h"
done

# see https://cloud.sylabs.io/ for more info
# steps from https://docs.sylabs.io/guides/3.11/admin-guide/installation.html

# from official deb
singularity_version="3.11.1"
singularity_deb="singularity-ce_${singularity_version}-jammy_amd64.deb"
singularity_url="https://github.com/sylabs/singularity/releases/download/v${singularity_version}/${singularity_deb}"

# pre-reqs
retry sudo apt-get install -y \
    build-essential \
    libseccomp-dev \
    libglib2.0-dev \
    pkg-config \
    squashfs-tools \
    cryptsetup \
    runc \
    uidmap

# install deb
cd /tmp
# -O makes a retried download overwrite the file instead of creating "<name>.1"
retry wget -O "${singularity_deb}" "${singularity_url}"
# sudo for consistency with the apt-get call above; dpkg needs root
sudo dpkg -i "${singularity_deb}"
# Remove every downloaded .deb in /tmp (as before); "./" and "--" keep odd
# file names from being parsed as options.
rm -- ./*.deb
14 changes: 14 additions & 0 deletions scripts/ubuntu-cuda/99-clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# Kept commented out in this directory:
# rm -rf /usr/src/*

# Last step: one more package cleanup pass, purging what is removed.
apt-get autoremove --yes --purge
6 changes: 6 additions & 0 deletions scripts/ubuntu-cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ubuntu-cuda

## TODO

- rename to ubuntu-translations

39 changes: 39 additions & 0 deletions scripts/ubuntu-jammy/01-kernel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# Record the running kernel release in the build log.
KERNEL_VERSION="$(uname -r)"
printf 'KERNEL_VERSION=%s\n' "$KERNEL_VERSION"

# Keep package installation from asking questions.
debconf-set-selections <<< 'debconf debconf/frontend select Noninteractive'
export DEBIAN_FRONTEND=noninteractive

retry apt-get update

# TODO: figure out issue on jammy
#   error:
#     The following packages have unmet dependencies:
#       linux-crashdump : Depends: kdump-tools but it is not installable
#
# Disabled until then - install crash debug tools:
#   retry apt-get install -y linux-crashdump kmod
#
# Disabled until then - kernel debug:
#   grep 'USE_KDUMP' /etc/default/kdump-tools
#   echo 'USE_KDUMP=1' >> /etc/default/kdump-tools

# On AWS, make sure the ENA and NVMe (Nitro) modules are listed in /etc/modules.
if [[ "$CLOUD" == "aws" ]]; then
    printf '%s\n' "ena" "nvme" | tee --append /etc/modules
fi

# A reboot is needed at this point to handle the kernel update;
# the grub script takes care of it.
41 changes: 41 additions & 0 deletions scripts/ubuntu-jammy/02-grub.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# GRUB
# adapted from https://bgstack15.wordpress.com/2018/05/02/update-etc-default-grub-programmatically/
GRUB_INFILE=/etc/default/grub

# Keep a pristine copy of the original configuration next to it.
cp -p "${GRUB_INFILE}" "${GRUB_INFILE}.orig"

# Edits are made on a scratch copy inside a private temp directory.
TMP_DIR="$(mktemp -d)"
TMP_FILE="$(TMPDIR="${TMP_DIR}" mktemp)"

# Create the scratch file if mktemp did not leave one behind, then fill it
# with the current grub defaults.
if [ ! -e "${TMP_FILE}" ]; then
    touch "${TMP_FILE}" || exit 1
fi
cat < "${GRUB_INFILE}" > "${TMP_FILE}"

# Kernel command line changes, applied to the scratch copy.
add_value_to_grub_line "${TMP_FILE}" "GRUB_CMDLINE_LINUX" "debug g"
add_value_to_grub_line "${TMP_FILE}" "GRUB_CMDLINE_LINUX_DEFAULT" "splash"
remove_value_from_grub_line "${TMP_FILE}" "GRUB_CMDLINE_LINUX_DEFAULT" "quiet"

update_grub_if_changed "${GRUB_INFILE}" "${TMP_FILE}"

# Print the resulting configuration, then drop the scratch directory.
cat "${GRUB_INFILE}"
rm -rf "${TMP_DIR}" 2>/dev/null

# FIXME does not exist?
# shown here https://launchpad.net/ubuntu/+source/linux-signed/4.15.0-58.64
# retry apt-get install -y linux-image-$KERNEL_VERSION-dbgsym

# Reboot, then idle forever: packer treats this script as finished and moves on
# to the next script once it reconnects.
shutdown -r now
while :
do
    sleep 1
done
33 changes: 33 additions & 0 deletions scripts/ubuntu-jammy/30-packages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

# Upgrades the installed packages, installs the base and miscellaneous
# packages for the image, and removes apport.

set -exv

# init helpers
helpers_dir=${MONOPACKER_HELPERS_DIR:-"/etc/monopacker/scripts"}
for h in "${helpers_dir}"/*.sh; do
    . "$h"
done

retry apt-get update
retry apt-get upgrade -y

# docker wants these
retry apt-get install -y \
    apt-transport-https \
    build-essential \
    ca-certificates \
    curl \
    gnupg-agent \
    software-properties-common

# misc
MISC_PACKAGES=(
    zstd
    python3-pip
    jq
    # docker-worker needs this for unpacking lz4 images
    liblz4-tool
)

# Quoted expansion passes each array element as exactly one argument.
retry apt-get install -y "${MISC_PACKAGES[@]}"

# Remove apport because it prevents obtaining crashes from containers
# and because it may send data to Canonical.
apt-get purge -y apport
14 changes: 14 additions & 0 deletions scripts/ubuntu-jammy/60-networking.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# Background: https://github.com/moby/libnetwork/issues/1090
# Insert a rule at the top of INPUT that drops packets conntrack marks INVALID,
# then save the IPv4 rule set to /etc/iptables/rules.v4.
retry apt-get install -y iptables-persistent
iptables --insert INPUT --match conntrack --ctstate INVALID --jump DROP
iptables-save > /etc/iptables/rules.v4
14 changes: 14 additions & 0 deletions scripts/ubuntu-jammy/99-clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# Empty /usr/src.
rm -rf /usr/src/*

# Last step: one more package cleanup pass, purging what is removed.
apt-get autoremove --yes --purge
27 changes: 27 additions & 0 deletions scripts/ubuntu-worker-requirements/01-deb-packages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# Refresh the package index before installing anything.
retry apt-get update

# Debian packages required on the worker.
worker_packages=(
    build-essential
    git
    libhunspell-1.7-0
    libhunspell-dev
    mercurial
    python3-zstd
    python3-certifi
    python3-psutil
    zstd
)
retry apt-get install -y "${worker_packages[@]}"

# Last step: one more package cleanup pass, purging what is removed.
apt-get autoremove --yes --purge
12 changes: 12 additions & 0 deletions scripts/ubuntu-worker-requirements/05-python3-packages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

set -e -x -v

# Load every shared helper script before doing any work.
helpers_dir=${MONOPACKER_HELPERS_DIR:-/etc/monopacker/scripts}
for helper in ${helpers_dir}/*.sh
do
    source $helper
done

# zstandard has no deb package, so it comes from pip3 instead.
pip3 install zstandard
22 changes: 22 additions & 0 deletions scripts/worker-runner-gw-systemd/10-create-worker-runner-config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Writes the worker-runner configuration file (/etc/start-worker.yml) that
# points worker-runner at the generic-worker binary and its config.

set -exv

# init helpers
helpers_dir=${MONOPACKER_HELPERS_DIR:-"/etc/monopacker/scripts"}
for h in "${helpers_dir}"/*.sh; do
    . "$h"
done

worker_runner_config="/etc/start-worker.yml"

# taken from https://github.com/mozilla-platform-ops/ronin_puppet/blob/master/modules/linux_generic_worker/templates/worker-runner-config.yml.erb

# The nested keys must be indented under `provider:` / `worker:`; without the
# indentation every key becomes a top-level entry and the two sections are empty.
# ${CLOUD} is expanded when this script runs (unquoted heredoc delimiter).
# NOTE(review): CLOUD is presumably exported by the build environment - confirm.
cat << EOF > "${worker_runner_config}"
provider:
  providerType: ${CLOUD}
worker:
  implementation: generic-worker
  path: /home/ubuntu/generic-worker/generic-worker
  configPath: /home/ubuntu/generic-worker/generic-worker.config
EOF
Loading

0 comments on commit 118673c

Please sign in to comment.