From 137c70e3136c86e592e186631038f9b2d7d611c3 Mon Sep 17 00:00:00 2001 From: Julien Chastang Date: Thu, 14 Nov 2024 15:05:15 -0700 Subject: [PATCH 1/9] graphcast hub --- jupyter-images/fall-2024/tm/Dockerfile | 38 ++++++++++++ jupyter-images/fall-2024/tm/environment.yml | 29 +++++++++ jupyter-images/fall-2024/tm/secrets.yaml | 66 +++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 jupyter-images/fall-2024/tm/Dockerfile create mode 100644 jupyter-images/fall-2024/tm/environment.yml create mode 100644 jupyter-images/fall-2024/tm/secrets.yaml diff --git a/jupyter-images/fall-2024/tm/Dockerfile b/jupyter-images/fall-2024/tm/Dockerfile new file mode 100644 index 00000000..34ba15e1 --- /dev/null +++ b/jupyter-images/fall-2024/tm/Dockerfile @@ -0,0 +1,38 @@ +# Heavily borrowed from docker-stacks/minimal-notebook/ +# https://github.com/jupyter/docker-stacks/blob/main/minimal-notebook/Dockerfile + +ARG BASE_CONTAINER=jupyter/minimal-notebook +FROM $BASE_CONTAINER + +ENV DEFAULT_ENV_NAME=tm-fall-2024 EDITOR=nano VISUAL=nano + +LABEL maintainer="Unidata " + +USER root + +RUN apt-get update && \ + apt-get install -y --no-install-recommends vim nano curl zip unzip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +USER $NB_UID + +ADD environment.yml /tmp + +RUN conda install --quiet --yes \ + 'conda-forge::nb_conda_kernels' \ + 'conda-forge::jupyterlab-git' \ + 'conda-forge::ipywidgets' && \ + conda env update --name $DEFAULT_ENV_NAME -f /tmp/environment.yml && \ + pip install --no-cache-dir nbgitpuller && \ + conda clean --all -f -y && \ + jupyter lab clean -y && \ + npm cache clean --force && \ + rm -rf /home/$NB_USER/.cache/yarn && \ + rm -rf /home/$NB_USER/.node-gyp && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER + +COPY Acknowledgements.ipynb / + +USER $NB_UID diff --git a/jupyter-images/fall-2024/tm/environment.yml b/jupyter-images/fall-2024/tm/environment.yml new file mode 100644 index 00000000..314ef895 --- 
/dev/null +++ b/jupyter-images/fall-2024/tm/environment.yml @@ -0,0 +1,29 @@ +name: tm-fall-2024 +channels: + - nvidia + - pytorch + - defaults + - conda-forge +dependencies: + # Required by JupyterLab + - python=3.10 + - nb_conda_kernels + - ipykernel + - notebook + - jupyter_server + # User requested packages + - cartopy + - cudatoolkit + - ipywidgets + - matplotlib + - metpy + - numpy + - pandas + - pip + - pytorch-cuda=12.1 + - scikit-learn + - seaborn + - siphon + - xarray + - pip: + - ai-models-fourcastnetv2 diff --git a/jupyter-images/fall-2024/tm/secrets.yaml b/jupyter-images/fall-2024/tm/secrets.yaml new file mode 100644 index 00000000..03905209 --- /dev/null +++ b/jupyter-images/fall-2024/tm/secrets.yaml @@ -0,0 +1,66 @@ +hub: + cookieSecret: "xxx" + config: + Authenticator: + admin_users: + - admins + #If you have a large list of users, consider using allowed_users.yaml + allowed_users: + - users + # necessary for jhub admins to add user via admin page `/hub/admin` + allow_existing_users: true + GitHubOAuthenticator: + client_id: "xxx" + client_secret: "xxx" + oauth_callback_url: "https://tm24f-1.ees220002.projects.jetstream-cloud.org:443/oauth_callback" + JupyterHub: + authenticator_class: github + +proxy: + secretToken: "xxx" + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: "nginx" + cert-manager.io/cluster-issuer: "letsencrypt" + #For manually issuing certificates: see vms/jupyter/readme.md + #cert-manager.io/issuer: "incommon" + nginx.ingress.kubernetes.io/proxy-body-size: 500m + hosts: + - "tm24f-1.ees220002.projects.jetstream-cloud.org" + tls: + - hosts: + - "tm24f-1.ees220002.projects.jetstream-cloud.org" + secretName: certmanager-tls-jupyterhub + +singleuser: + extraEnv: + NBGITPULLER_DEPTH: "0" + storage: + capacity: 10Gi + startTimeout: 600 + memory: + guarantee: 4G + limit: 4G + cpu: + guarantee: 1 + limit: 2 + defaultUrl: "/lab" + image: + name: "unidata/tm24f" + tag: "xxx" + lifecycleHooks: + postStart: + exec: + 
command: + - "bash" + - "-c" + - > + dir="/home/jovyan/.ssh"; [ -d $dir ] && { chmod 700 $dir && chmod -f 600 $dir/* && chmod -f 644 $dir/*.pub; } || true; + cp -t /home/jovyan /Acknowledgements.ipynb; + [[ -f $HOME/.bashrc ]] || cp /etc/skel/.bashrc $HOME/; + [[ -f $HOME/.profile ]] || cp /etc/skel/.profile $HOME/; + [[ -f $HOME/.bash_logout ]] || cp /etc/skel/.bash_logout $HOME/; + [[ -f $HOME/.condarc ]] || cp /.condarc $HOME/; + [ -d "/share" ] && [ ! -L ~/share ] && ln -s /share ~/share || true; From 5bdaf33cfa35b6615f132d78936da1d53740f162 Mon Sep 17 00:00:00 2001 From: ana espinoza Date: Wed, 20 Nov 2024 16:23:19 -0700 Subject: [PATCH 2/9] Add some niceties; conda-->mamba --- jupyter-images/fall-2024/tm/.condarc | 2 + .../fall-2024/tm/Acknowledgements.ipynb | 43 ++++++++++++ jupyter-images/fall-2024/tm/Dockerfile | 12 ++-- jupyter-images/fall-2024/tm/build.sh | 28 ++++++++ jupyter-images/fall-2024/tm/default_kernel.py | 67 +++++++++++++++++++ jupyter-images/fall-2024/tm/overrides.json | 7 ++ 6 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 jupyter-images/fall-2024/tm/.condarc create mode 100644 jupyter-images/fall-2024/tm/Acknowledgements.ipynb create mode 100755 jupyter-images/fall-2024/tm/build.sh create mode 100755 jupyter-images/fall-2024/tm/default_kernel.py create mode 100644 jupyter-images/fall-2024/tm/overrides.json diff --git a/jupyter-images/fall-2024/tm/.condarc b/jupyter-images/fall-2024/tm/.condarc new file mode 100644 index 00000000..7c536d94 --- /dev/null +++ b/jupyter-images/fall-2024/tm/.condarc @@ -0,0 +1,2 @@ +envs_dirs: + - /home/jovyan/additional-envs diff --git a/jupyter-images/fall-2024/tm/Acknowledgements.ipynb b/jupyter-images/fall-2024/tm/Acknowledgements.ipynb new file mode 100644 index 00000000..fae591fc --- /dev/null +++ b/jupyter-images/fall-2024/tm/Acknowledgements.ipynb @@ -0,0 +1,43 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c86cd54f-b73c-4781-b6eb-89c79d3d3b22", + "metadata": {}, + 
"source": [ + "## Acknowledgements\n", + "\n", + "Launching this JupyterHub server is the result of a collaboration between several research and academic institutions and their staff. For Jetstream2 and JupyterHub expertise, we thank Andrea Zonca (San Diego Supercomputing Center), Jeremy Fischer, Mike Lowe (Indiana University), the NSF Jetstream2 (`doi:10.1145/3437359.3465565`) team.\n", + "\n", + "This work employs the NSF Jetstream2 Cloud at Indiana University through allocation EES220002 from the Advanced Cyberinfrastructure Coordination Ecosystem: Services & Support (ACCESS) program, which is supported by National Science Foundation grants #2138259, #2138286, #2138307, #2137603, and #2138296.\n", + "\n", + "Unidata is one of the University Corporation for Atmospheric Research (UCAR)'s Community Programs (UCP), and is funded primarily by the National Science Foundation (AGS-2403649).\n", + "\n", + "## To Acknowledge This JupyterHub and the Unidata Science Gateway\n", + "\n", + "If you have benefited from the Unidata Science Gateway, please cite `doi:10.5065/688s-2w73`. 
Additional citation information can be found in this [Citation File Format file](https://raw.githubusercontent.com/Unidata/science-gateway/master/CITATION.cff).\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter-images/fall-2024/tm/Dockerfile b/jupyter-images/fall-2024/tm/Dockerfile index 34ba15e1..edd3ea70 100644 --- a/jupyter-images/fall-2024/tm/Dockerfile +++ b/jupyter-images/fall-2024/tm/Dockerfile @@ -19,13 +19,13 @@ USER $NB_UID ADD environment.yml /tmp -RUN conda install --quiet --yes \ +RUN mamba install --quiet --yes \ 'conda-forge::nb_conda_kernels' \ 'conda-forge::jupyterlab-git' \ 'conda-forge::ipywidgets' && \ - conda env update --name $DEFAULT_ENV_NAME -f /tmp/environment.yml && \ + mamba env update --name $DEFAULT_ENV_NAME -f /tmp/environment.yml && \ pip install --no-cache-dir nbgitpuller && \ - conda clean --all -f -y && \ + mamba clean --all -f -y && \ jupyter lab clean -y && \ npm cache clean --force && \ rm -rf /home/$NB_USER/.cache/yarn && \ @@ -33,6 +33,10 @@ RUN conda install --quiet --yes \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -COPY Acknowledgements.ipynb / +COPY Acknowledgements.ipynb \ + default_kernel.py .condarc / + +ARG JUPYTER_SETTINGS_DIR=/opt/conda/share/jupyter/lab/settings/ +COPY overrides.json $JUPYTER_SETTINGS_DIR USER $NB_UID diff --git a/jupyter-images/fall-2024/tm/build.sh b/jupyter-images/fall-2024/tm/build.sh new file mode 100755 index 00000000..2b6fad0a --- /dev/null +++ b/jupyter-images/fall-2024/tm/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Check if an image name is provided +if [ -z "$1" 
]; then + echo "Error: No image name provided." + echo "Usage: $0 " + exit 1 +fi + +IMAGE_NAME=$1 + +DATE_TAG=$(date "+%Y%b%d_%H%M%S") +RANDOM_HEX=$(openssl rand -hex 2) +TAG="${DATE_TAG}_${RANDOM_HEX}" + +FULL_TAG="unidata/$IMAGE_NAME:$TAG" + +echo "Building Docker image with tag: $FULL_TAG" + +docker build --no-cache --pull --tag "$FULL_TAG" . + +# Check if the build was successful +if [ $? -eq 0 ]; then + echo "Docker image built successfully: $FULL_TAG" +else + echo "Error: Docker build failed." + exit 1 +fi diff --git a/jupyter-images/fall-2024/tm/default_kernel.py b/jupyter-images/fall-2024/tm/default_kernel.py new file mode 100755 index 00000000..80a432f3 --- /dev/null +++ b/jupyter-images/fall-2024/tm/default_kernel.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +import argparse +import glob +import json +import os +import re + + +def update_kernelspec_in_notebooks(directory, new_name): + """ + Updates the kernelspec in all Jupyter Notebook files within the specified + directory and its subdirectories, while preserving the original file + formatting. + + Args: + directory (str): The path to the directory containing .ipynb files. + new_name (str): The new name to set in the kernelspec. + """ + for file_path in glob.glob(f'{directory}/**/*.ipynb', recursive=True): + try: + with open(file_path, 'r', encoding='utf-8') as file: + file_contents = file.read() + notebook = json.loads(file_contents) + + if 'kernelspec' not in notebook.get('metadata', {}): + print(f"No kernelspec found in {file_path}. Skipping file.") + continue + + kernelspec = notebook['metadata']['kernelspec'] + kernelspec['display_name'] = f"Python [conda env:{new_name}]" + kernelspec['name'] = f"conda-env-{new_name}-py" + + # Convert the updated kernelspec dictionary to a JSON-formatted + # string with indentation + updated_kernelspec = json.dumps(kernelspec, indent=4) + + # Replace the existing kernelspec section in the original file + # contents with the updated JSON string. 
The regular expression + # looks for the "kernelspec" key and replaces its value up to the + # first closing brace (so it assumes a flat, un-nested kernelspec), + # leaving the rest of the file's formatting untouched. + updated_contents = re.sub( + r'"kernelspec": *\{.*?\}', + f'"kernelspec": {updated_kernelspec}', + file_contents, flags=re.DOTALL + ) + + with open(file_path, 'w', encoding='utf-8') as file: + file.write(updated_contents) + + except Exception as e: + print(f"Error processing file {file_path}: {e}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Update the kernel name in " + "Jupyter Notebook files in directory " + "tree.") + parser.add_argument("new_kernel_name", help="New kernel name to set.") + parser.add_argument("directory_path", nargs='?', default=os.getcwd(), + help="Directory containing .ipynb files (default: " + "current directory).") + + args = parser.parse_args() + + update_kernelspec_in_notebooks(args.directory_path, args.new_kernel_name) diff --git a/jupyter-images/fall-2024/tm/overrides.json b/jupyter-images/fall-2024/tm/overrides.json new file mode 100644 index 00000000..c60d6b61 --- /dev/null +++ b/jupyter-images/fall-2024/tm/overrides.json @@ -0,0 +1,7 @@ +{ + "@jupyterlab/docmanager-extension:plugin": { + "defaultViewers": { + "markdown": "Markdown Preview" + } + } +} From 16dd45a3b6737559aeeab08eb99ee62215beef12 Mon Sep 17 00:00:00 2001 From: ana espinoza Date: Wed, 20 Nov 2024 17:29:18 -0700 Subject: [PATCH 3/9] Modify resource limits --- jupyter-images/fall-2024/tm/secrets.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jupyter-images/fall-2024/tm/secrets.yaml b/jupyter-images/fall-2024/tm/secrets.yaml index 03905209..9a2e82d3 100644 --- a/jupyter-images/fall-2024/tm/secrets.yaml +++ b/jupyter-images/fall-2024/tm/secrets.yaml @@ -41,11 +41,11 @@ singleuser: capacity: 10Gi startTimeout: 600 memory: - guarantee: 4G - limit: 4G + guarantee: 24G + limit: 24G cpu: - guarantee: 1 - limit: 2 + 
guarantee: 6 + limit: 6 defaultUrl: "/lab" image: name: "unidata/tm24f" From 1b34df15d3de261a0368e41b3637234265ba4e3d Mon Sep 17 00:00:00 2001 From: Julien Chastang Date: Thu, 21 Nov 2024 10:21:00 -0700 Subject: [PATCH 4/9] additional niceties --- jupyter-images/fall-2024/tm/Dockerfile | 2 +- .../fall-2024/tm/GPU_sanity_check.ipynb | 287 ++++++++++++++++++ .../fall-2024/tm/gpu/jupyterhub_gpu.yaml | 14 + jupyter-images/fall-2024/tm/secrets.yaml | 2 +- 4 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 jupyter-images/fall-2024/tm/GPU_sanity_check.ipynb create mode 100644 jupyter-images/fall-2024/tm/gpu/jupyterhub_gpu.yaml diff --git a/jupyter-images/fall-2024/tm/Dockerfile b/jupyter-images/fall-2024/tm/Dockerfile index edd3ea70..339cc9e4 100644 --- a/jupyter-images/fall-2024/tm/Dockerfile +++ b/jupyter-images/fall-2024/tm/Dockerfile @@ -33,7 +33,7 @@ RUN mamba install --quiet --yes \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -COPY Acknowledgements.ipynb \ +COPY GPU_sanity_check.ipynb Acknowledgements.ipynb \ default_kernel.py .condarc / ARG JUPYTER_SETTINGS_DIR=/opt/conda/share/jupyter/lab/settings/ diff --git a/jupyter-images/fall-2024/tm/GPU_sanity_check.ipynb b/jupyter-images/fall-2024/tm/GPU_sanity_check.ipynb new file mode 100644 index 00000000..287c3a45 --- /dev/null +++ b/jupyter-images/fall-2024/tm/GPU_sanity_check.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f1b605ba-8d44-4654-be99-df2d39289c36", + "metadata": {}, + "source": [ + "## GPU JHub Testing Notebook" + ] + }, + { + "cell_type": "markdown", + "id": "25cacd08-18b8-4991-966e-7e49aa44192a", + "metadata": {}, + "source": [ + "Notebook used for first pass testing of the environment and GPU access. Here are the various JS2 GPU instance [flavors](https://docs.jetstream-cloud.org/general/instance-flavors/#jetstream2-gpu)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "8f0798c9-ee1d-4157-a4be-deedd90bd9a5", + "metadata": {}, + "source": [ + "Note: this also tests the PyTorch install; as of November 2024, I hope to not use TensorFlow for work at UCAR / Unidata. This entire notebook should run without any errors. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "831b1a5d-488d-476c-8050-4f18cd635c0c", + "metadata": {}, + "outputs": [], + "source": [ + "import psutil\n", + "import platform\n", + "import sys\n", + "\n", + "import torch\n", + "import platform" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "50c795b6-34a9-4c2b-ab71-67c5ea087fa2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_simple_system_info():\n", + " # Memory info\n", + " memory = psutil.virtual_memory()\n", + " ram_gb = memory.total / (1024 ** 3) # Convert to GB\n", + " ram_used_gb = memory.used / (1024 ** 3)\n", + " \n", + " # CPU info\n", + " cpu_cores = psutil.cpu_count()\n", + " cpu_usage = psutil.cpu_percent(interval=1)\n", + " \n", + " print(f\"Python Version: {platform.python_version()}\")\n", + " print(f\"\\nCPU:\")\n", + " print(f\"- Cores: {cpu_cores}\")\n", + " print(f\"- Current Usage: {cpu_usage}%\")\n", + " print(f\"\\nRAM:\")\n", + " print(f\"- Total: {ram_gb:.1f} GB\")\n", + " print(f\"- Used: {ram_used_gb:.1f} GB\")\n", + " print(f\"- Usage: {memory.percent}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7ce932bc-9406-4841-91ca-371e3c768980", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python Version: 3.10.15\n", + "\n", + "CPU:\n", + "- Cores: 8\n", + "- Current Usage: 1.0%\n", + "\n", + "RAM:\n", + "- Total: 29.4 GB\n", + "- Used: 1.4 GB\n", + "- Usage: 6.1%\n" + ] + } + ], + "source": [ + "get_simple_system_info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9aa829ef-1093-4d6d-9052-a86a1a8647fc", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Thu Nov 21 16:36:04 2024 \n", + "+---------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |\n", + "|-----------------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|=========================================+======================+======================|\n", + "| 0 GRID A100X-10C On | 00000000:04:00.0 Off | 0 |\n", + "| N/A N/A P0 N/A / N/A | 0MiB / 10240MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-----------------------------------------+----------------------+----------------------+\n", + " \n", + "+---------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=======================================================================================|\n", + "| No running processes found |\n", + "+---------------------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0bb2ce0f-afba-4693-8072-bccb92dca0bf", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pytorch_info():\n", + " print(\"PyTorch System Information\")\n", + " print(\"-\" * 30)\n", + " \n", + " # PyTorch version\n", + " print(f\"PyTorch Version: {torch.__version__}\")\n", + " \n", + " # CUDA availability\n", + " print(f\"\\nCUDA Available: {torch.cuda.is_available()}\")\n", + " \n", + " if torch.cuda.is_available():\n", + " # Current device information\n", + " current_device = torch.cuda.current_device()\n", + " print(f\"Current CUDA Device: 
{current_device}\")\n", + " \n", + " # Device name\n", + " print(f\"Device Name: {torch.cuda.get_device_name(current_device)}\")\n", + " \n", + " # CUDA version\n", + " print(f\"CUDA Version: {torch.version.cuda}\")\n", + " \n", + " # Number of CUDA devices\n", + " print(f\"Device Count: {torch.cuda.device_count()}\")\n", + " \n", + " # Memory information\n", + " print(\"\\nGPU Memory Information:\")\n", + " print(f\"- Total: {torch.cuda.get_device_properties(current_device).total_memory / 1024**3:.2f} GB\")\n", + " print(f\"- Allocated: {torch.cuda.memory_allocated(current_device) / 1024**3:.2f} GB\")\n", + " print(f\"- Cached: {torch.cuda.memory_reserved(current_device) / 1024**3:.2f} GB\")\n", + " \n", + " # Architecture information\n", + " device_props = torch.cuda.get_device_properties(current_device)\n", + " print(f\"\\nGPU Architecture:\")\n", + " print(f\"- GPU Compute Capability: {device_props.major}.{device_props.minor}\")\n", + " print(f\"- Multi Processors: {device_props.multi_processor_count}\")\n", + " else:\n", + " print(\"\\nNo CUDA GPU available. 
PyTorch will run on CPU only.\")\n", + " print(f\"CPU Architecture: {platform.machine()}\")\n", + " print(f\"CPU Type: {platform.processor()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ea2e96f2-37fe-49a0-8cc8-d32a3b666a0a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyTorch System Information\n", + "------------------------------\n", + "PyTorch Version: 2.5.1+cu124\n", + "\n", + "CUDA Available: True\n", + "Current CUDA Device: 0\n", + "Device Name: GRID A100X-10C\n", + "CUDA Version: 12.4\n", + "Device Count: 1\n", + "\n", + "GPU Memory Information:\n", + "- Total: 10.00 GB\n", + "- Allocated: 0.00 GB\n", + "- Cached: 0.00 GB\n", + "\n", + "GPU Architecture:\n", + "- GPU Compute Capability: 8.0\n", + "- Multi Processors: 108\n" + ] + } + ], + "source": [ + "get_pytorch_info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b3f8dad5-6c3c-4a64-bc31-b9ef5d5894fb", + "metadata": {}, + "outputs": [], + "source": [ + "def get_instance_type():\n", + " cpu_count = psutil.cpu_count()\n", + " ram_gb = psutil.virtual_memory().total / (1024**3)\n", + " gpu_ram = 0\n", + " \n", + " if torch.cuda.is_available():\n", + " current_device = torch.cuda.current_device()\n", + " gpu_ram = torch.cuda.get_device_properties(current_device).total_memory / (1024**3)\n", + " \n", + " if cpu_count == 4 and 13 <= ram_gb <= 17 and 7 <= gpu_ram <= 9:\n", + " return \"g3.small\"\n", + " elif cpu_count == 8 and 28 <= ram_gb <= 32 and 9 <= gpu_ram <= 11:\n", + " return \"g3.medium\"\n", + " elif cpu_count == 16 and 58 <= ram_gb <= 62 and 19 <= gpu_ram <= 21:\n", + " return \"g3.large\"\n", + " elif cpu_count == 32 and 123 <= ram_gb <= 127 and 39 <= gpu_ram <= 41:\n", + " return \"g3.xl\"\n", + " else:\n", + " return \"custom\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ef30c0c2-bf93-4c7a-97e2-c80f7c503f20", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "'g3.medium'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_instance_type()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tm-fall-2024]", + "language": "python", + "name": "conda-env-tm-fall-2024-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter-images/fall-2024/tm/gpu/jupyterhub_gpu.yaml b/jupyter-images/fall-2024/tm/gpu/jupyterhub_gpu.yaml new file mode 100644 index 00000000..c8c312f7 --- /dev/null +++ b/jupyter-images/fall-2024/tm/gpu/jupyterhub_gpu.yaml @@ -0,0 +1,14 @@ +singleuser: + profileList: + - display_name: "GPU Server" + description: "Spawns a notebook server with access to a GPU" + kubespawner_override: + tolerations: + - key: 'gpu' + operator: 'Equal' + value: 'true' + effect: 'NoSchedule' + extra_resource_limits: + nvidia.com/gpu: "1" + - display_name: "CPU Server" + description: "Spawns a standard notebook server" diff --git a/jupyter-images/fall-2024/tm/secrets.yaml b/jupyter-images/fall-2024/tm/secrets.yaml index 9a2e82d3..fa8af79b 100644 --- a/jupyter-images/fall-2024/tm/secrets.yaml +++ b/jupyter-images/fall-2024/tm/secrets.yaml @@ -58,7 +58,7 @@ singleuser: - "-c" - > dir="/home/jovyan/.ssh"; [ -d $dir ] && { chmod 700 $dir && chmod -f 600 $dir/* && chmod -f 644 $dir/*.pub; } || true; - cp -t /home/jovyan /Acknowledgements.ipynb; + cp -t /home/jovyan /Acknowledgements.ipynb GPU_sanity_check.ipynb; [[ -f $HOME/.bashrc ]] || cp /etc/skel/.bashrc $HOME/; [[ -f $HOME/.profile ]] || cp /etc/skel/.profile $HOME/; [[ -f $HOME/.bash_logout ]] || cp /etc/skel/.bash_logout $HOME/; From 95df6f65fe4e91a73814bc0ecbd55407cb051c9f Mon Sep 17 00:00:00 2001 
From: ana espinoza Date: Thu, 5 Dec 2024 08:58:54 -0700 Subject: [PATCH 5/9] Revert to standard env before changing --- jupyter-images/fall-2024/tm/environment.yml | 24 +++++++++------------ 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/jupyter-images/fall-2024/tm/environment.yml b/jupyter-images/fall-2024/tm/environment.yml index 314ef895..5f48d3e1 100644 --- a/jupyter-images/fall-2024/tm/environment.yml +++ b/jupyter-images/fall-2024/tm/environment.yml @@ -1,29 +1,25 @@ name: tm-fall-2024 channels: - - nvidia - - pytorch - - defaults - conda-forge dependencies: # Required by JupyterLab - - python=3.10 + - 'python<3.13' - nb_conda_kernels - ipykernel - - notebook - - jupyter_server # User requested packages - - cartopy - - cudatoolkit - - ipywidgets + - numpy - matplotlib + - cartopy - metpy - - numpy + - siphon - pandas - pip - - pytorch-cuda=12.1 + - xarray + - ipywidgets + - python-awips - scikit-learn - seaborn - - siphon - - xarray - pip: - - ai-models-fourcastnetv2 + # It is recommended to install a package using pip as a last resort, i.e. + # when it is not found in the conda repos + - palmerpenguins From a82fafd8d1db3152dd163722e1a7d11a7d4e14ff Mon Sep 17 00:00:00 2001 From: ana espinoza Date: Thu, 5 Dec 2024 10:32:04 -0700 Subject: [PATCH 6/9] Env: ai-models-graphcast and deps --- jupyter-images/fall-2024/tm/environment.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jupyter-images/fall-2024/tm/environment.yml b/jupyter-images/fall-2024/tm/environment.yml index 5f48d3e1..2cfa597e 100644 --- a/jupyter-images/fall-2024/tm/environment.yml +++ b/jupyter-images/fall-2024/tm/environment.yml @@ -22,4 +22,7 @@ dependencies: - pip: # It is recommended to install a package using pip as a last resort, i.e. 
# when it is not found in the conda repos - - palmerpenguins + # See: https://github.com/ecmwf-lab/ai-models-graphcast?tab=readme-ov-file#installation + - ai-models-graphcast + - jax[cuda12_pip] + - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html From cf9feb6b167124064af3f27c8ff2d32d503700a2 Mon Sep 17 00:00:00 2001 From: ana espinoza Date: Thu, 5 Dec 2024 12:15:35 -0700 Subject: [PATCH 7/9] Add graphcast dep --- jupyter-images/fall-2024/tm/environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/jupyter-images/fall-2024/tm/environment.yml b/jupyter-images/fall-2024/tm/environment.yml index 2cfa597e..37c36eae 100644 --- a/jupyter-images/fall-2024/tm/environment.yml +++ b/jupyter-images/fall-2024/tm/environment.yml @@ -26,3 +26,4 @@ dependencies: - ai-models-graphcast - jax[cuda12_pip] - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - git+https://github.com/deepmind/graphcast.git From 7364b7959863c8215d327aa7bc51bfebcacb9222 Mon Sep 17 00:00:00 2001 From: Julien Chastang Date: Thu, 19 Dec 2024 16:50:36 -0700 Subject: [PATCH 8/9] earth2mip --- jupyter-images/fall-2024/tm/Dockerfile | 2 +- jupyter-images/fall-2024/tm/environment.yml | 12 ++++++------ jupyter-images/fall-2024/tm/secrets.yaml | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/jupyter-images/fall-2024/tm/Dockerfile b/jupyter-images/fall-2024/tm/Dockerfile index 339cc9e4..86d1b814 100644 --- a/jupyter-images/fall-2024/tm/Dockerfile +++ b/jupyter-images/fall-2024/tm/Dockerfile @@ -1,7 +1,7 @@ # Heavily borrowed from docker-stacks/minimal-notebook/ # https://github.com/jupyter/docker-stacks/blob/main/minimal-notebook/Dockerfile -ARG BASE_CONTAINER=jupyter/minimal-notebook +ARG BASE_CONTAINER=jupyter/tensorflow-notebook FROM $BASE_CONTAINER ENV DEFAULT_ENV_NAME=tm-fall-2024 EDITOR=nano VISUAL=nano diff --git a/jupyter-images/fall-2024/tm/environment.yml b/jupyter-images/fall-2024/tm/environment.yml index 37c36eae..fdafc214 100644 --- 
a/jupyter-images/fall-2024/tm/environment.yml +++ b/jupyter-images/fall-2024/tm/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: # Required by JupyterLab - - 'python<3.13' + - 'python<3.11' - nb_conda_kernels - ipykernel # User requested packages @@ -22,8 +22,8 @@ dependencies: - pip: # It is recommended to install a package using pip as a last resort, i.e. # when it is not found in the conda repos - # See: https://github.com/ecmwf-lab/ai-models-graphcast?tab=readme-ov-file#installation - - ai-models-graphcast - - jax[cuda12_pip] - - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - git+https://github.com/deepmind/graphcast.git + - git+https://github.com/NVIDIA/earth2mip + - torch + - torchvision + - torchaudio + - --extra-index-url https://download.pytorch.org/whl/cu121 diff --git a/jupyter-images/fall-2024/tm/secrets.yaml b/jupyter-images/fall-2024/tm/secrets.yaml index fa8af79b..e5f88f8b 100644 --- a/jupyter-images/fall-2024/tm/secrets.yaml +++ b/jupyter-images/fall-2024/tm/secrets.yaml @@ -64,3 +64,4 @@ singleuser: [[ -f $HOME/.bash_logout ]] || cp /etc/skel/.bash_logout $HOME/; [[ -f $HOME/.condarc ]] || cp /.condarc $HOME/; [ -d "/share" ] && [ ! 
-L ~/share ] && ln -s /share ~/share || true; + gitpuller https://github.com/nvidia/earth2mip main earth2mip || true; From 5e1f13996c15ab0f7c21eff0321c5d922bff47c8 Mon Sep 17 00:00:00 2001 From: ana espinoza Date: Fri, 7 Feb 2025 11:38:43 -0700 Subject: [PATCH 9/9] Minimal notebook works fine --- jupyter-images/fall-2024/tm/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter-images/fall-2024/tm/Dockerfile b/jupyter-images/fall-2024/tm/Dockerfile index 86d1b814..339cc9e4 100644 --- a/jupyter-images/fall-2024/tm/Dockerfile +++ b/jupyter-images/fall-2024/tm/Dockerfile @@ -1,7 +1,7 @@ # Heavily borrowed from docker-stacks/minimal-notebook/ # https://github.com/jupyter/docker-stacks/blob/main/minimal-notebook/Dockerfile -ARG BASE_CONTAINER=jupyter/tensorflow-notebook +ARG BASE_CONTAINER=jupyter/minimal-notebook FROM $BASE_CONTAINER ENV DEFAULT_ENV_NAME=tm-fall-2024 EDITOR=nano VISUAL=nano