diff --git a/ci/environment.yml b/ci/environment.yml index 581e275f..a7130683 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,8 +14,8 @@ dependencies: - packaging - universal_pathlib - hdf5plugin - - numcodecs>=0.15.1 - - icechunk>=0.1.1 + - numcodecs + - icechunk>=0.2.0 # Testing - codecov[toml] - pre-commit diff --git a/ci/upstream.yml b/ci/upstream.yml index c6c13d30..854e448b 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -30,6 +30,5 @@ dependencies: - pip - zarr>=3.0.2 - pip: - - git+https://github.com/earth-mover/icechunk.git@main#subdirectory=icechunk-python # Installs zarr-python v3.0.0 as dependency - git+https://github.com/fsspec/kerchunk.git@main - imagecodecs-numcodecs==2024.6.1 diff --git a/docs/contributing.md b/docs/contributing.md index 45fc8599..3c41e260 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -71,5 +71,5 @@ Anyone with commit privileges to the repository can issue a release, and you sho 8. Edit the draft release notes for consistency. 9. Select 'Publish' to publish the release. This should automatically upload the new release to [PyPI](https://pypi.org/project/virtualizarr/) and [conda-forge](https://anaconda.org/conda-forge/virtualizarr). 10. Check that this has run successfully (PyPI should show the new version number very quickly, but conda-forge might take several hours). -11. Create and merge a PR to add a new empty section to the `docs/releases.rst` for the next release in the future. +11. Create and merge a PR to add a new empty section to the `docs/releases.rst` for the next release in the future. See [this commit](https://github.com/zarr-developers/VirtualiZarr/commit/e3912f08e22f2e3230af6eb1a2aacb5728822fa1) for an example (you can assume the next release will be numbered `vX.Y.Z+1`, but the number doesn't actually matter). 12. 
(Optional) Advertise the release on social media 📣 diff --git a/docs/releases.rst b/docs/releases.rst index d9e31251..4ecade48 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -1,14 +1,36 @@ Release notes ============= +.. _v1.3.2: + +v1.3.2 (unreleased) +------------------- + +New Features +~~~~~~~~~~~~ + +Breaking changes +~~~~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +Documentation +~~~~~~~~~~~~~ + +Internal Changes +~~~~~~~~~~~~~~~~ .. _v1.3.1: -v1.3.1 (unreleased) +v1.3.1 (18th Feb 2025) ------------------- New Features ~~~~~~~~~~~~ - +- Examples use new Icechunk syntax Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/docs/usage.md b/docs/usage.md index b610beb3..af0199bd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -471,15 +471,15 @@ By default references are placed in separate parquet file when the total number ### Writing to an Icechunk Store -We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk ` accessor method. +We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native transactional tensor storage engine that is compatible with Zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk ` accessor method. ```python # create an icechunk repository, session and write the virtual dataset to the session -from icechunk import Repository, Storage, VirtualChunkContainer, local_filesystem_storage +import icechunk storage = local_filesystem_storage("./local/icechunk/store") # By default, local virtual references and public remote virtual references can be read wihtout extra configuration. 
-repo = Repository.create(storage=storage) +repo = icechunk.Repository.create(storage) session = repo.writeable_session("main") # write the virtual dataset to the session with the IcechunkStore @@ -500,6 +500,7 @@ session.commit("Appended second dataset") ``` See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details. +icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr ## Opening Kerchunk references as virtual datasets diff --git a/examples/append/noaa-cdr-sst.ipynb b/examples/append/noaa-cdr-sst.ipynb index 0f80b169..6e47defc 100644 --- a/examples/append/noaa-cdr-sst.ipynb +++ b/examples/append/noaa-cdr-sst.ipynb @@ -19,9 +19,7 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install -e \".[icechunk]\"\n", - "# !pip install git+https://github.com/mpiannucci/kerchunk@v3\n", - "# !pip install fsspec s3fs" + "# !pip install 'virtualizarr['icechunk','hdf']' ipykernel" ] }, { @@ -35,13 +33,13 @@ "output_type": "stream", "text": [ "Name: icechunk\n", - "Version: 0.1.0a7\n", - "Summary: Transactional storage engine for Zarr designed for use on cloud object storage\n", + "Version: 0.1.2\n", + "Summary: Icechunk Python\n", "Home-page: https://github.com/earth-mover/icechunk\n", "Author: Earthmover PBC\n", - "Author-email: \n", + "Author-email: Earthmover \n", "License: Apache-2.0\n", - "Location: /Users/aimeebarciauskas/github/virtualizarr/venv/lib/python3.12/site-packages\n", + "Location: /opt/homebrew/envs/virtualizarr-tests/lib/python3.12/site-packages\n", "Requires: zarr\n", "Required-by: \n" ] @@ -61,8 +59,8 @@ "import warnings\n", "\n", "import fsspec\n", + "import icechunk\n", "import xarray as xr\n", - "from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig\n", "\n", "from virtualizarr import open_virtual_dataset\n", "\n", @@ -137,6 +135,450 @@ { "cell_type": "code", "execution_count": 7, + "id": "c025f35d", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 66MB\n",
+       "Dimensions:  (time: 2, zlev: 1, lat: 720, lon: 1440)\n",
+       "Coordinates:\n",
+       "    time     (time) float32 8B ManifestArray<shape=(2,), dtype=float32, chunk...\n",
+       "    zlev     (zlev) float32 4B ManifestArray<shape=(1,), dtype=float32, chunk...\n",
+       "    lat      (lat) float32 3kB ManifestArray<shape=(720,), dtype=float32, chu...\n",
+       "    lon      (lon) float32 6kB ManifestArray<shape=(1440,), dtype=float32, ch...\n",
+       "Data variables:\n",
+       "    sst      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
+       "    anom     (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
+       "    err      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
+       "    ice      (time, zlev, lat, lon) float64 17MB ManifestArray<shape=(2, 1, 7...\n",
+       "Attributes: (12/37)\n",
+       "    Conventions:                CF-1.6, ACDD-1.3\n",
+       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n",
+       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...\n",
+       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
+       "    id:                         oisst-avhrr-v02r01.20240801.nc\n",
+       "    naming_authority:           gov.noaa.ncei\n",
+       "    ...                         ...\n",
+       "    time_coverage_start:        2024-08-01T00:00:00Z\n",
+       "    time_coverage_end:          2024-08-01T23:59:59Z\n",
+       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
+       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
+       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
+       "    sensor:                     Thermometer, AVHRR
" + ], + "text/plain": [ + " Size: 66MB\n", + "Dimensions: (time: 2, zlev: 1, lat: 720, lon: 1440)\n", + "Coordinates:\n", + " time (time) float32 8B ManifestArray span {\n", @@ -445,15 +887,15 @@ "}\n", "\n", ".xr-dim-list:before {\n", - " content: '(';\n", + " content: \"(\";\n", "}\n", "\n", ".xr-dim-list:after {\n", - " content: ')';\n", + " content: \")\";\n", "}\n", "\n", ".xr-dim-list li:not(:last-child):after {\n", - " content: ',';\n", + " content: \",\";\n", " padding-right: 5px;\n", "}\n", "\n", @@ -604,17 +1046,17 @@ " fill: currentColor;\n", "}\n", "
<xarray.Dataset> Size: 66MB\n",
-       "Dimensions:  (time: 2, zlev: 1, lat: 720, lon: 1440)\n",
+       "Dimensions:  (lon: 1440, time: 2, zlev: 1, lat: 720)\n",
        "Coordinates:\n",
+       "  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n",
        "  * time     (time) datetime64[ns] 16B 2024-08-01T12:00:00 2024-08-02T12:00:00\n",
-       "  * zlev     (zlev) float32 4B 0.0\n",
        "  * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
-       "  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n",
+       "  * zlev     (zlev) float32 4B 0.0\n",
        "Data variables:\n",
-       "    err      (time, zlev, lat, lon) float64 17MB ...\n",
-       "    anom     (time, zlev, lat, lon) float64 17MB ...\n",
-       "    ice      (time, zlev, lat, lon) float64 17MB ...\n",
-       "    sst      (time, zlev, lat, lon) float64 17MB ...\n",
+       "    sst      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
+       "    ice      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
+       "    anom     (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
+       "    err      (time, zlev, lat, lon) float64 17MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
        "Attributes: (12/37)\n",
        "    Conventions:                CF-1.6, ACDD-1.3\n",
        "    cdm_data_type:              Grid\n",
@@ -628,34 +1070,414 @@
        "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
        "    time_coverage_end:          2024-08-01T23:59:59Z\n",
        "    time_coverage_start:        2024-08-01T00:00:00Z\n",
-       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
  • Conventions :
    CF-1.6, ACDD-1.3
    cdm_data_type :
    Grid
    comment :
    Data was converted from NetCDF-3 to NetCDF-4 format with metadata updates in November 2017.
    creator_email :
    oisst-help@noaa.gov
    creator_url :
    https://www.ncei.noaa.gov/
    date_created :
    2024-08-16T09:12:00Z
    date_modified :
    2024-08-16T09:12:00Z
    geospatial_lat_max :
    90.0
    geospatial_lat_min :
    -90.0
    geospatial_lat_resolution :
    0.25
    geospatial_lat_units :
    degrees_north
    geospatial_lon_max :
    360.0
    geospatial_lon_min :
    0.0
    geospatial_lon_resolution :
    0.25
    geospatial_lon_units :
    degrees_east
    history :
    Final file created using preliminary as first guess, and 3 days of AVHRR data. Preliminary uses only 1 day of AVHRR data.
    id :
    oisst-avhrr-v02r01.20240801.nc
    institution :
    NOAA/National Centers for Environmental Information
    instrument :
    Earth Remote Sensing Instruments > Passive Remote Sensing > Spectrometers/Radiometers > Imaging Spectrometers/Radiometers > AVHRR > Advanced Very High Resolution Radiometer
    instrument_vocabulary :
    Global Change Master Directory (GCMD) Instrument Keywords
    keywords :
    Earth Science > Oceans > Ocean Temperature > Sea Surface Temperature
    keywords_vocabulary :
    Global Change Master Directory (GCMD) Earth Science Keywords
    metadata_link :
    https://doi.org/10.25921/RE9P-PT57
    naming_authority :
    gov.noaa.ncei
    ncei_template_version :
    NCEI_NetCDF_Grid_Template_v2.0
    platform :
    Ships, buoys, Argo floats, MetOp-A, MetOp-B
    platform_vocabulary :
    Global Change Master Directory (GCMD) Platform Keywords
    processing_level :
    NOAA Level 4
    product_version :
    Version v02r01
    references :
    Reynolds, et al.(2007) Daily High-Resolution-Blended Analyses for Sea Surface Temperature (available at https://doi.org/10.1175/2007JCLI1824.1). Banzon, et al.(2016) A long-term record of blended satellite and in situ sea-surface temperature for climate monitoring, modeling and environmental studies (available at https://doi.org/10.5194/essd-8-165-2016). Huang et al. (2020) Improvements of the Daily Optimum Interpolation Sea Surface Temperature (DOISST) Version v02r01, submitted.Climatology is based on 1971-2000 OI.v2 SST. Satellite data: Pathfinder AVHRR SST, Navy AVHRR SST, and NOAA ACSPO SST. Ice data: NCEP Ice and GSFC Ice.
    sensor :
    Thermometer, AVHRR
    source :
    ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfinder_AVHRR, Navy_AVHRR, NOAA_ACSP
    standard_name_vocabulary :
    CF Standard Name Table (v40, 25 January 2017)
    summary :
    NOAAs 1/4-degree Daily Optimum Interpolation Sea Surface Temperature (OISST) (sometimes referred to as Reynolds SST, which however also refers to earlier products at different resolution), currently available as version v02r01, is created by interpolating and extrapolating SST observations from different sources, resulting in a smoothed complete field. The sources of data are satellite (AVHRR) and in situ platforms (i.e., ships and buoys), and the specific datasets employed may change over time. At the marginal ice zone, sea ice concentrations are used to generate proxy SSTs. A preliminary version of this file is produced in near-real time (1-day latency), and then replaced with a final version after 2 weeks. Note that this is the AVHRR-ONLY DOISST, available from Oct 1981, but there is a companion DOISST product that includes microwave satellite data, available from June 2002
    time_coverage_end :
    2024-08-01T23:59:59Z
    time_coverage_start :
    2024-08-01T00:00:00Z
    title :
    NOAA/NCEI 1/4 Degree Daily Optimum Interpolation Sea Surface Temperature (OISST) Analysis, Version 2.1 - Final
  • " ], "text/plain": [ " Size: 66MB\n", - "Dimensions: (time: 2, zlev: 1, lat: 720, lon: 1440)\n", + "Dimensions: (lon: 1440, time: 2, zlev: 1, lat: 720)\n", "Coordinates:\n", + " * lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n", " * time (time) datetime64[ns] 16B 2024-08-01T12:00:00 2024-08-02T12:00:00\n", - " * zlev (zlev) float32 4B 0.0\n", " * lat (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", - " * lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n", + " * zlev (zlev) float32 4B 0.0\n", "Data variables:\n", - " err (time, zlev, lat, lon) float64 17MB ...\n", - " anom (time, zlev, lat, lon) float64 17MB ...\n", - " ice (time, zlev, lat, lon) float64 17MB ...\n", - " sst (time, zlev, lat, lon) float64 17MB ...\n", + " sst (time, zlev, lat, lon) float64 17MB dask.array\n", + " ice (time, zlev, lat, lon) float64 17MB dask.array\n", + " anom (time, zlev, lat, lon) float64 17MB dask.array\n", + " err (time, zlev, lat, lon) float64 17MB dask.array\n", "Attributes: (12/37)\n", " Conventions: CF-1.6, ACDD-1.3\n", " cdm_data_type: Grid\n", @@ -678,7 +1500,7 @@ } ], "source": [ - "ds = xr.open_zarr(store, consolidated=False, zarr_format=3)\n", + "ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)\n", "ds" ] }, @@ -730,9 +1552,7 @@ "metadata": {}, "outputs": [], "source": [ - "append_store = IcechunkStore.open_existing(\n", - " storage=storage_config, config=virtual_ref_store_config, read_only=False\n", - ")" + "append_session = repo.writable_session(\"main\")" ] }, { @@ -742,7 +1562,7 @@ "metadata": {}, "outputs": [], "source": [ - "virtual_ds_a.virtualize.to_icechunk(append_store, append_dim=\"time\")" + "virtual_ds_a.virtualize.to_icechunk(append_session.store, append_dim=\"time\")" ] }, { @@ -754,7 +1574,7 @@ { "data": { "text/plain": [ - "'0HE5RZ869HTG8RZESHCG'" + "'3MEW3ECB74ZYANAZZHT0'" ] }, "execution_count": 17, @@ -763,7 +1583,7 @@ } ], "source": [ - 
"append_store.commit(\"wrote 2 more days of data\")" + "append_session.commit(\"wrote 2 more days of data\")" ] }, { @@ -781,9 +1601,7 @@ "metadata": {}, "outputs": [], "source": [ - "read_store = IcechunkStore.open_existing(\n", - " storage=storage_config, config=virtual_ref_store_config, read_only=True\n", - ")" + "read_session = repo.readonly_session(branch=\"main\")" ] }, { @@ -825,14 +1643,14 @@ " --xr-background-color-row-odd: var(--jp-layout-color2, #eeeeee);\n", "}\n", "\n", - "html[theme=dark],\n", - "html[data-theme=dark],\n", - "body[data-theme=dark],\n", + "html[theme=\"dark\"],\n", + "html[data-theme=\"dark\"],\n", + "body[data-theme=\"dark\"],\n", "body.vscode-dark {\n", " --xr-font-color0: rgba(255, 255, 255, 1);\n", " --xr-font-color2: rgba(255, 255, 255, 0.54);\n", " --xr-font-color3: rgba(255, 255, 255, 0.38);\n", - " --xr-border-color: #1F1F1F;\n", + " --xr-border-color: #1f1f1f;\n", " --xr-disabled-color: #515151;\n", " --xr-background-color: #111111;\n", " --xr-background-color-row-even: #111111;\n", @@ -887,6 +1705,7 @@ ".xr-section-item input {\n", " display: inline-block;\n", " opacity: 0;\n", + " height: 0;\n", "}\n", "\n", ".xr-section-item input + label {\n", @@ -923,7 +1742,7 @@ "\n", ".xr-section-summary-in + label:before {\n", " display: inline-block;\n", - " content: 'â–º';\n", + " content: \"â–º\";\n", " font-size: 11px;\n", " width: 15px;\n", " text-align: center;\n", @@ -934,7 +1753,7 @@ "}\n", "\n", ".xr-section-summary-in:checked + label:before {\n", - " content: 'â–¼';\n", + " content: \"â–¼\";\n", "}\n", "\n", ".xr-section-summary-in:checked + label > span {\n", @@ -1006,15 +1825,15 @@ "}\n", "\n", ".xr-dim-list:before {\n", - " content: '(';\n", + " content: \"(\";\n", "}\n", "\n", ".xr-dim-list:after {\n", - " content: ')';\n", + " content: \")\";\n", "}\n", "\n", ".xr-dim-list li:not(:last-child):after {\n", - " content: ',';\n", + " content: \",\";\n", " padding-right: 5px;\n", "}\n", "\n", @@ -1165,17 +1984,17 @@ " fill: 
currentColor;\n", "}\n", "
    <xarray.Dataset> Size: 133MB\n",
    -       "Dimensions:  (time: 4, zlev: 1, lat: 720, lon: 1440)\n",
    +       "Dimensions:  (zlev: 1, time: 4, lat: 720, lon: 1440)\n",
            "Coordinates:\n",
    +       "  * zlev     (zlev) float32 4B 0.0\n",
            "  * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
            "  * time     (time) datetime64[ns] 32B 2024-08-01T12:00:00 ... 2024-08-04T12:...\n",
    -       "  * zlev     (zlev) float32 4B 0.0\n",
            "  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n",
            "Data variables:\n",
    -       "    anom     (time, zlev, lat, lon) float64 33MB ...\n",
    -       "    ice      (time, zlev, lat, lon) float64 33MB ...\n",
    -       "    err      (time, zlev, lat, lon) float64 33MB ...\n",
    -       "    sst      (time, zlev, lat, lon) float64 33MB ...\n",
    +       "    ice      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
    +       "    err      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
    +       "    sst      (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
    +       "    anom     (time, zlev, lat, lon) float64 33MB dask.array<chunksize=(1, 1, 720, 1440), meta=np.ndarray>\n",
            "Attributes: (12/37)\n",
            "    Conventions:                CF-1.6, ACDD-1.3\n",
            "    cdm_data_type:              Grid\n",
    @@ -1189,37 +2008,425 @@
            "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
            "    time_coverage_end:          2024-08-03T23:59:59Z\n",
            "    time_coverage_start:        2024-08-03T00:00:00Z\n",
    -       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
  • Conventions :
    CF-1.6, ACDD-1.3
    cdm_data_type :
    Grid
    comment :
    Data was converted from NetCDF-3 to NetCDF-4 format with metadata updates in November 2017.
    creator_email :
    oisst-help@noaa.gov
    creator_url :
    https://www.ncei.noaa.gov/
    date_created :
    2024-08-18T09:12:00Z
    date_modified :
    2024-08-18T09:12:00Z
    geospatial_lat_max :
    90.0
    geospatial_lat_min :
    -90.0
    geospatial_lat_resolution :
    0.25
    geospatial_lat_units :
    degrees_north
    geospatial_lon_max :
    360.0
    geospatial_lon_min :
    0.0
    geospatial_lon_resolution :
    0.25
    geospatial_lon_units :
    degrees_east
    history :
    Final file created using preliminary as first guess, and 3 days of AVHRR data. Preliminary uses only 1 day of AVHRR data.
    id :
    oisst-avhrr-v02r01.20240803.nc
    institution :
    NOAA/National Centers for Environmental Information
    instrument :
    Earth Remote Sensing Instruments > Passive Remote Sensing > Spectrometers/Radiometers > Imaging Spectrometers/Radiometers > AVHRR > Advanced Very High Resolution Radiometer
    instrument_vocabulary :
    Global Change Master Directory (GCMD) Instrument Keywords
    keywords :
    Earth Science > Oceans > Ocean Temperature > Sea Surface Temperature
    keywords_vocabulary :
    Global Change Master Directory (GCMD) Earth Science Keywords
    metadata_link :
    https://doi.org/10.25921/RE9P-PT57
    naming_authority :
    gov.noaa.ncei
    ncei_template_version :
    NCEI_NetCDF_Grid_Template_v2.0
    platform :
    Ships, buoys, Argo floats, MetOp-A, MetOp-B
    platform_vocabulary :
    Global Change Master Directory (GCMD) Platform Keywords
    processing_level :
    NOAA Level 4
    product_version :
    Version v02r01
    references :
    Reynolds, et al.(2007) Daily High-Resolution-Blended Analyses for Sea Surface Temperature (available at https://doi.org/10.1175/2007JCLI1824.1). Banzon, et al.(2016) A long-term record of blended satellite and in situ sea-surface temperature for climate monitoring, modeling and environmental studies (available at https://doi.org/10.5194/essd-8-165-2016). Huang et al. (2020) Improvements of the Daily Optimum Interpolation Sea Surface Temperature (DOISST) Version v02r01, submitted.Climatology is based on 1971-2000 OI.v2 SST. Satellite data: Pathfinder AVHRR SST, Navy AVHRR SST, and NOAA ACSPO SST. Ice data: NCEP Ice and GSFC Ice.
    sensor :
    Thermometer, AVHRR
    source :
    ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfinder_AVHRR, Navy_AVHRR, NOAA_ACSP
    standard_name_vocabulary :
    CF Standard Name Table (v40, 25 January 2017)
    summary :
    NOAAs 1/4-degree Daily Optimum Interpolation Sea Surface Temperature (OISST) (sometimes referred to as Reynolds SST, which however also refers to earlier products at different resolution), currently available as version v02r01, is created by interpolating and extrapolating SST observations from different sources, resulting in a smoothed complete field. The sources of data are satellite (AVHRR) and in situ platforms (i.e., ships and buoys), and the specific datasets employed may change over time. At the marginal ice zone, sea ice concentrations are used to generate proxy SSTs. A preliminary version of this file is produced in near-real time (1-day latency), and then replaced with a final version after 2 weeks. Note that this is the AVHRR-ONLY DOISST, available from Oct 1981, but there is a companion DOISST product that includes microwave satellite data, available from June 2002
    time_coverage_end :
    2024-08-03T23:59:59Z
    time_coverage_start :
    2024-08-03T00:00:00Z
    title :
    NOAA/NCEI 1/4 Degree Daily Optimum Interpolation Sea Surface Temperature (OISST) Analysis, Version 2.1 - Final
  • " ], "text/plain": [ " Size: 133MB\n", - "Dimensions: (time: 4, zlev: 1, lat: 720, lon: 1440)\n", + "Dimensions: (zlev: 1, time: 4, lat: 720, lon: 1440)\n", "Coordinates:\n", + " * zlev (zlev) float32 4B 0.0\n", " * lat (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", " * time (time) datetime64[ns] 32B 2024-08-01T12:00:00 ... 2024-08-04T12:...\n", - " * zlev (zlev) float32 4B 0.0\n", " * lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9\n", "Data variables:\n", - " anom (time, zlev, lat, lon) float64 33MB ...\n", - " ice (time, zlev, lat, lon) float64 33MB ...\n", - " err (time, zlev, lat, lon) float64 33MB ...\n", - " sst (time, zlev, lat, lon) float64 33MB ...\n", + " ice (time, zlev, lat, lon) float64 33MB dask.array\n", + " err (time, zlev, lat, lon) float64 33MB dask.array\n", + " sst (time, zlev, lat, lon) float64 33MB dask.array\n", + " anom (time, zlev, lat, lon) float64 33MB dask.array\n", "Attributes: (12/37)\n", " Conventions: CF-1.6, ACDD-1.3\n", " cdm_data_type: Grid\n", @@ -1242,24 +2449,16 @@ } ], "source": [ - "ds = xr.open_zarr(read_store, consolidated=False, zarr_format=3)\n", + "ds = xr.open_zarr(read_session.store, consolidated=False, zarr_format=3)\n", "ds" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41808f96", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "virtualizarr", + "display_name": "Python 3", "language": "python", - "name": "venv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1271,7 +2470,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/examples/coiled/terraclimate.ipynb b/examples/coiled/terraclimate.ipynb index 205f094b..3af2dbd1 100644 --- a/examples/coiled/terraclimate.ipynb +++ b/examples/coiled/terraclimate.ipynb @@ -35,7 +35,7 @@ "You should install the Python 
requirements in a clean virtual environment of your choice. Each coiled serverless function will re-use this environment, so it's best to start with a clean slate.\n", "\n", "```bash\n", - "pip install virtualizarr coiled xarray fastparquet ipykernel\n", + "pip install 'virtualizarr['icechunk','hdf']' coiled ipykernel bokeh\n", "```" ] }, @@ -48,11 +48,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import coiled\n", + "import icechunk\n", "import numpy as np\n", "import xarray as xr\n", "\n", @@ -72,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -133,7 +134,7 @@ " spot_policy=\"spot_with_fallback\",\n", " arm=True,\n", " idle_timeout=\"10 minutes\",\n", - " n_workers=[100, 300],\n", + " n_workers=[10, 100],\n", " name=\"parallel_reference_generation\",\n", ")\n", "def process(filename):\n", @@ -141,15 +142,13 @@ " filename,\n", " decode_times=True,\n", " loadable_variables=[\"time\", \"lat\", \"lon\", \"crs\"],\n", - " filetype=\"netcdf4\",\n", - " indexes={},\n", " )\n", " return vds\n", "\n", "\n", "# process.map distributes out the input file urls to coiled functions\n", "# retires=10 allows for individual task retires, which can be useful for inconsistent server behavior\n", - "results = process.map(combinations, retries=10)" + "results = process.map(combinations[0:2], retries=10)" ] }, { @@ -171,8 +170,9 @@ "\n", "# combine individual refs into a virtual Xarray dataset\n", "mds = xr.combine_by_coords(\n", - " vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop_conflicts\"\n", + " vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop\"\n", ")\n", + "\n", "mds" ] }, @@ -189,9 +189,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save the reference to disk\n", + "## Save the virtual dataset to Icechunk\n", "\n", - "Now that we have this virtual 
dataset, we can save the combined reference file for future use. The resulting reference parquet file is only 2.6MB!\n" + "Now that we have this virtual dataset, we can write it to Icechunk. \n", + "\n", + "In this example we're creating a local icechunk store, but you could configure it for cloud storage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "local_storage_config = icechunk.local_filesystem_storage(\"./terraclimate\")\n", + "repo = icechunk.Repository.open_or_create(local_storage_config)\n", + "session = repo.writable_session(\"main\")" ] }, { @@ -200,15 +213,14 @@ "metadata": {}, "outputs": [], "source": [ - "mds.virtualize.to_kerchunk(\"terraclimate.parquet\", format=\"parquet\")" + "mds.virtualize.to_icechunk(store=session.store)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Open the reference file and load into Xarray\n", - "You can now open up the reference file with Xarray and Kerchunk. This will now behave similarly to a normal Xarray dataset. 
\n", + "## Open the Icechunk store with Xarray\n", "\n", "**Warning:** Calling `to_zarr` on this dataset will try to write out 1TB of data.\n" ] @@ -219,7 +231,15 @@ "metadata": {}, "outputs": [], "source": [ - "combined_ds = xr.open_dataset(\"terraclimate.parquet\", engine=\"kerchunk\", chunks={})\n", + "combined_ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "combined_ds" ] } @@ -240,7 +260,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index b68c919b..f6784732 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ all_readers = [ # writers icechunk = [ - "icechunk>=0.1.1", + "icechunk>=0.2.0", ] kerchunk = ["fastparquet"] all_writers = [ diff --git a/virtualizarr/vendor/__init__.py b/virtualizarr/vendor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 1c044465..89eb015f 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -304,7 +304,10 @@ def write_manifest_virtual_refs( ) -> None: """Write all the virtual references for one array manifest at once.""" - key_prefix = f"{group.name}/{arr_name}" + if group.name == "/": + key_prefix = arr_name + else: + key_prefix = f"{group.name}/{arr_name}" # loop over every reference in the ChunkManifest for that array # TODO inefficient: this should be replaced with something that sets all (new) references for the array at once