diff --git a/icechunk-python/notebooks/era5_xarray.ipynb b/icechunk-python/notebooks/era5_xarray.ipynb
new file mode 100644
index 00000000..ee9e73e6
--- /dev/null
+++ b/icechunk-python/notebooks/era5_xarray.ipynb
@@ -0,0 +1,426 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "40c929a3-87d4-4c0e-a97d-1300d8adcae0",
+   "metadata": {},
+   "source": [
+    "# Use Icechunk with Xarray on ERA5 Data\n",
+    "\n",
+    "In this demo we will use Icechunk with Xarray to read and write ERA5 data.\n",
+    "\n",
+    "We will start with a single netCDF file from the [NCAR ERA5 AWS Public Dataset](https://nsf-ncar-era5.s3.amazonaws.com/index.html)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b2904d5f-090b-4344-a2f7-99096ba26d27",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "xarray: 0.5.2.dev4594+gd9d6fee2\n",
+      "dask: 0+untagged.8339.gc0f671e\n",
+      "zarr: 3.0.0a7\n",
+      "icechunk: 0.1.0-alpha.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import xarray as xr\n",
+    "import zarr\n",
+    "import dask\n",
+    "import fsspec\n",
+    "import h5py\n",
+    "from dask.diagnostics import ProgressBar\n",
+    "\n",
+    "import icechunk\n",
+    "from icechunk import IcechunkStore, StorageConfig\n",
+    "\n",
+    "print('xarray: ', xr.__version__)\n",
+    "print('dask: ', dask.__version__)\n",
+    "print('zarr: ', zarr.__version__)\n",
+    "print('icechunk:', icechunk.__version__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "081e1a71-873e-45c3-b77d-5b7aa1617286",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 284 ms, sys: 39.2 ms, total: 323 ms\n",
+      "Wall time: 2.18 s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/srv/conda/envs/icechunk/lib/python3.12/site-packages/xarray/backends/api.py:357: UserWarning: The specified chunks separate the stored chunks along dimension \"time\" starting at index 1. This could degrade performance. Instead, consider rechunking after loading.\n",
+      " var_chunks = _get_chunk(var, chunks, chunkmanager)\n"
+     ]
+    }
+   ],
+   "source": [
+    "url = \"https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc\"\n",
+    "%time ds = xr.open_dataset(fsspec.open(url).open(), engine=\"h5netcdf\", chunks={\"time\": 1})\n",
+    "ds = ds.drop_encoding()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b3048527-c50f-451c-9500-cac6c22dd1bc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<xarray.Dataset> Size: 4GB\n",
+      "Dimensions: (time: 24, level: 37, latitude: 721, longitude: 1440)\n",
+      "Coordinates:\n",
+      " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n",
+      " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n",
+      " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n",
+      " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n",
+      "Data variables:\n",
+      " PV (time, level, latitude, longitude) float32 4GB dask.array\n",
+      " utc_date (time) int32 96B dask.array\n",
+      "Attributes:\n",
+      " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n",
+      " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n",
+      " NETCDF_VERSION: 4.8.1\n",
+      " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n",
+      " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n",
+      " Conventions: CF-1.6\n",
+      " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n",
+      " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n",
+      " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ds)"
+   ]
+  },
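+  {
+   "cell_type": "markdown",
+   "id": "1a2b3c4d-5e6f-4a7b-8c9d-0e1f2a3b4c5d",
+   "metadata": {},
+   "source": [
+    "Before loading anything, we can sanity-check the lazy, Dask-backed dataset. A minimal sketch, added for illustration (not part of the original run; the exact chunk layout depends on the xarray and Dask versions):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b3c4d5e-6f70-4b8c-9d0e-1f2a3b4c5d6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Nothing has been read yet: each variable is a lazy dask array.\n",
+    "# Dataset.chunks maps each dimension to its chunk sizes; with\n",
+    "# chunks={\"time\": 1} we expect 24 single-hour chunks along time.\n",
+    "print(ds.chunks)\n",
+    "print(f\"PV uncompressed size: {ds.PV.nbytes / 1e9:.1f} GB\")"
+   ]
+  },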
+  {
+   "cell_type": "markdown",
+   "id": "7f4a801c-b570-45e3-b37f-2e140a2fb273",
+   "metadata": {},
+   "source": [
+    "### Load Data from HDF5 File\n",
+    "\n",
+    "This illustrates how loading directly from HDF5 files on S3 can be slow, even with Dask."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "29e344c6-a25e-4342-979f-d2d2c7aed7a7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[########################################] | 100% Completed | 75.10 ss\n"
+     ]
+    }
+   ],
+   "source": [
+    "with ProgressBar():\n",
+    "    dsl = ds.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bdbd3f6c-e62c-4cfc-8cfb-b0fa22b6bddd",
+   "metadata": {},
+   "source": [
+    "### Initialize Icechunk Repo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "9283b1f5-a0e9-43ef-bd8a-5985bedc2d17",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "store = await IcechunkStore.create(\n",
+    "    storage=StorageConfig.s3_from_env(\n",
+    "        bucket=\"icechunk-test\",\n",
+    "        prefix=\"ryan/icechunk-tests-era5-4\"),\n",
+    "    mode=\"w\"\n",
+    ")\n",
+    "store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b13b469d-45d7-4844-b153-b44d274cb220",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('main', 'NFT4V6D9HTP7YYVPFQCG')"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "store.branch, store.snapshot_id"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12c4ce5a-f1dd-4576-9d89-071583cd92a4",
+   "metadata": {},
+   "source": [
+    "### Store Data To Icechunk\n",
+    "\n",
+    "We specify encoding to set both the compression codecs and the chunk shape."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "67c6389d-79a0-4992-b845-6a633cb4d86b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoding = {\n",
+    "    \"PV\": {\n",
+    "        \"codecs\": [zarr.codecs.BytesCodec(), zarr.codecs.ZstdCodec()],\n",
+    "        \"chunks\": (1, 1, 721, 1440)\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e632068-fb29-4a6f-a3d0-d19edb8f68a2",
+   "metadata": {},
+   "source": [
+    "Note that Dask is not required to obtain good performance when reading and writing. Zarr and Icechunk use multithreading and asyncio internally."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b9a8c5ab-cc5a-4a05-b4ba-3b52be187e18",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 41.9 s, sys: 1.53 s, total: 43.4 s\n",
+      "Wall time: 20.5 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time dsl.to_zarr(store, zarr_format=3, consolidated=False, encoding=encoding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b6b19d8b-3655-4213-99c9-5857c2ac126b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'FVSG2XNYSF0475HP8DKG'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await store.commit(\"wrote data\")"
+   ]
+  },
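+  {
+   "cell_type": "markdown",
+   "id": "3c4d5e6f-7081-4c9d-a0e1-2a3b4c5d6e7f",
+   "metadata": {},
+   "source": [
+    "The commit returned a new snapshot ID. As a sketch of how we might get back to exactly this state later (not part of the original run; it assumes the alpha API's `IcechunkStore.open_existing` constructor, which may change in later releases), we can reopen the repo read-only:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d5e6f70-8192-4da0-b1e2-3b4c5d6e7f80",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: reopen the same repo read-only and confirm we see the\n",
+    "# committed snapshot (assumes IcechunkStore.open_existing exists\n",
+    "# in this alpha API; names may change in later releases).\n",
+    "store_ro = await IcechunkStore.open_existing(\n",
+    "    storage=StorageConfig.s3_from_env(\n",
+    "        bucket=\"icechunk-test\",\n",
+    "        prefix=\"ryan/icechunk-tests-era5-4\"),\n",
+    "    mode=\"r\"\n",
+    ")\n",
+    "store_ro.branch, store_ro.snapshot_id"
+   ]
+  },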
+  {
+   "cell_type": "markdown",
+   "id": "34b1a12c-9640-4f8b-a5fc-2ade040b437c",
+   "metadata": {},
+   "source": [
+    "### Read Data Back"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a9c1bfc7-61d2-4a92-ab82-b026e7b9fcf6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 14.8 ms, sys: 428 μs, total: 15.3 ms\n",
+      "Wall time: 50.6 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time dsic = xr.open_dataset(store, consolidated=False, engine=\"zarr\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c09243a3-9965-4952-a7af-21f4e95697b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<xarray.Dataset> Size: 4GB\n",
+      "Dimensions: (longitude: 1440, time: 24, latitude: 721, level: 37)\n",
+      "Coordinates:\n",
+      " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n",
+      " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n",
+      " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n",
+      " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n",
+      "Data variables:\n",
+      " PV (time, level, latitude, longitude) float32 4GB ...\n",
+      " utc_date (time) int32 96B ...\n",
+      "Attributes:\n",
+      " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n",
+      " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n",
+      " Conventions: CF-1.6\n",
+      " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n",
+      " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n",
+      " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n",
+      " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n",
+      " NETCDF_VERSION: 4.8.1\n",
+      " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dsic)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "feb23457-c6fe-4363-8393-c92ab1ae7a89",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 9.18 ms, sys: 7.07 ms, total: 16.2 ms\n",
+      "Wall time: 79.6 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array(0.00710905, dtype=float32)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time dsic.PV[0, 0, 0, 0].values"
+   ]
+  },
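+  {
+   "cell_type": "markdown",
+   "id": "5e6f7081-92a3-4eb1-82f3-4c5d6e7f8091",
+   "metadata": {},
+   "source": [
+    "Because we wrote chunks of shape `(1, 1, 721, 1440)`, a single time/level slice maps to exactly one stored chunk. A minimal sketch of such a read, added for illustration (not part of the original run; timings will vary):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f708192-a3b4-4f82-93a4-5d6e7f809102",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One horizontal slice == one (1, 1, 721, 1440) chunk, so this\n",
+    "# should fetch a single chunk's worth of data from the store.\n",
+    "%time pv_slice = dsic.PV.isel(time=0, level=0).values\n",
+    "pv_slice.shape"
+   ]
+  },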
+  {
+   "cell_type": "markdown",
+   "id": "2eef8e3a-c0ce-4383-b76a-e852a50f7398",
+   "metadata": {},
+   "source": [
+    "As with writing, Dask is not required for performant reading of the data.\n",
+    "In this example we can load the entire dataset (nearly 4GB) in under 10 seconds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d5103624-554c-4d18-a323-d24f82b99818",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 8.08 s, sys: 1.93 s, total: 10 s\n",
+      "Wall time: 9.65 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time _ = dsic.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:icechunk]",
+   "language": "python",
+   "name": "conda-env-icechunk-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}