From 2788c09ce72f6a1fcc74fb48b18b2164b4da2af9 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 25 Jan 2024 13:26:03 -0500 Subject: [PATCH 01/11] Add Zarr IO tutorial --- docs/gallery/advanced_io/zarr_io.py | 89 +++++++++++++++++++++++++++++ docs/source/conf.py | 2 + 2 files changed, 91 insertions(+) create mode 100644 docs/gallery/advanced_io/zarr_io.py diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py new file mode 100644 index 000000000..75f4f1237 --- /dev/null +++ b/docs/gallery/advanced_io/zarr_io.py @@ -0,0 +1,89 @@ +""" +Zarr IO +======= + +Zarr is an alternative backend option for NWB files. It is a Python package that +provides an implementation of chunked, compressed, N-dimensional arrays. Zarr is a good +option for large datasets because, like HDF5, it is designed to store data on disk and +only load the data into memory when needed. Zarr is also a good option for parallel +computing because it supports concurrent reads and writes. + +Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr` package. First, create an +an NWBFile using PyNWB. +""" + +from datetime import datetime +from dateutil.tz import tzlocal + +import numpy as np +from pynwb import NWBFile, TimeSeries + +# Create the NWBFile. Substitute your NWBFile generation here. +nwbfile = NWBFile( + session_description="my first synthetic recording", + identifier="EXAMPLE_ID", + session_start_time=datetime.now(tzlocal()), + session_id="LONELYMTN", +) + +####################################################################################### +# Dataset Configuration +# --------------------- +# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these +# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the analogous +# :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified by the +# `numcodecs` library. 
For example, to create a :py:class:`.TimeSeries` +# with a Zarr backend, use the following: + +from numcodecs import Blosc +from hdmf_zarr import ZarrDataIO + +data_with_zarr_data_io = ZarrDataIO( + data=np.random.randn(100, 100), + chunks=(10, 10), + fillvalue=0, + compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE) +) + +####################################################################################### +# Now add it to the `NWBFile`. + +nwbfile.add_acquisition( + TimeSeries( + name="synthetic_timeseries", + data=data_with_zarr_data_io, + unit="m", + rate=10e3, + ) +) + +####################################################################################### +# Writing to Zarr +# --------------- +# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with +# :py:class:`hdmf_zarr.nwb.NWBZarrIO` for read/write + +from hdmf_zarr.nwb import NWBZarrIO +import os + +path = "zarr_tutorial.nwb.zarr" +absolute_path = os.path.abspath(path) +with NWBZarrIO(path=path, mode="w") as io: + io.write(nwbfile) + +####################################################################################### +# The main reason for using the absolute_path here is for testing purposes to ensure +# links and references work as expected. Otherwise, using the relative path here instead +# is fine. +# +# Reading from Zarr +# ----------------- +# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the analogous +# :py:class:`hdmf_zarr.nwb.NWBZarrIO`. + +with NWBZarrIO(path=absolute_path, mode="r") as io: + read_nwbfile = io.read() + +####################################################################################### +# For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`. 
+ diff --git a/docs/source/conf.py b/docs/source/conf.py index 143d9d2c6..faf7d4a9b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -148,6 +148,7 @@ def __call__(self, filename): 'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None), 'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None), 'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None), + 'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None), } extlinks = { @@ -159,6 +160,7 @@ def __call__(self, filename): 'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'), 'dandi': ('https://www.dandiarchive.org/%s', '%s'), "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"), + 'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'), } # Add any paths that contain templates here, relative to this directory. From 62e4050c4edc6490c31fa8acd52b007732c3176b Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Thu, 25 Jan 2024 13:27:52 -0500 Subject: [PATCH 02/11] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ddcdda7b..2251ba6e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ ### Documentation and tutorial enhancements - Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) - Fix typos and improve clarify throughout tutorials. 
@zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825) +- Add Zarr IO tutorial [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834) ## PyNWB 2.5.0 (August 18, 2023) From cb2a6c6c70fa0dc976ba2cc2c6e0a2fca2c13baa Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Thu, 25 Jan 2024 13:28:16 -0500 Subject: [PATCH 03/11] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2251ba6e5..c36666c7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ ### Documentation and tutorial enhancements - Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) - Fix typos and improve clarify throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825) -- Add Zarr IO tutorial [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834) +- Add Zarr IO tutorial @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834) ## PyNWB 2.5.0 (August 18, 2023) From b707c060105933c21e4ef1ff9020f6a0a3518b57 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 25 Jan 2024 17:10:36 -0500 Subject: [PATCH 04/11] add info, add thumbnail --- docs/gallery/advanced_io/zarr_io.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py index 75f4f1237..996f570b6 100644 --- a/docs/gallery/advanced_io/zarr_io.py +++ b/docs/gallery/advanced_io/zarr_io.py @@ -8,10 +8,18 @@ only load the data into memory when needed. Zarr is also a good option for parallel computing because it supports concurrent reads and writes. +Note that the Zarr native storage formats are optimized for storage in cloud storage +(e.g., S3). 
For very large files, Zarr will create many files which can lead to +issues for traditional file system (that are not cloud object stores) due to limitations +on the number of files per directory (this affects local disk, GDrive, Dropbox etc.). + Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr` package. First, create an an NWBFile using PyNWB. """ +# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png' + + from datetime import datetime from dateutil.tz import tzlocal @@ -85,5 +93,6 @@ read_nwbfile = io.read() ####################################################################################### -# For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`. +# .. note:: +# For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`. From 8d25eab5ba6317ed5deb029a54f0aa1f125ecc37 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Thu, 25 Jan 2024 22:58:51 -0800 Subject: [PATCH 05/11] Update docs/gallery/advanced_io/zarr_io.py --- docs/gallery/advanced_io/zarr_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py index 996f570b6..fcc790dcd 100644 --- a/docs/gallery/advanced_io/zarr_io.py +++ b/docs/gallery/advanced_io/zarr_io.py @@ -10,7 +10,7 @@ Note that the Zarr native storage formats are optimized for storage in cloud storage (e.g., S3). For very large files, Zarr will create many files which can lead to -issues for traditional file system (that are not cloud object stores) due to limitations +issues for traditional file systems (that are not cloud object stores) due to limitations on the number of files per directory (this affects local disk, GDrive, Dropbox etc.). Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr` package. 
First, create an From c6318b311e759fbcc0f99a3d0cadb35c4e3505f5 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Thu, 25 Jan 2024 23:01:11 -0800 Subject: [PATCH 06/11] Update docs/gallery/advanced_io/zarr_io.py --- docs/gallery/advanced_io/zarr_io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py index fcc790dcd..58dddb79e 100644 --- a/docs/gallery/advanced_io/zarr_io.py +++ b/docs/gallery/advanced_io/zarr_io.py @@ -10,8 +10,9 @@ Note that the Zarr native storage formats are optimized for storage in cloud storage (e.g., S3). For very large files, Zarr will create many files which can lead to -issues for traditional file systems (that are not cloud object stores) due to limitations -on the number of files per directory (this affects local disk, GDrive, Dropbox etc.). +issues for traditional file systems (that are not cloud object stores) due to +limitations on the number of files per directory (this affects local disk, +GDrive, Dropbox etc.). Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr` package. First, create an an NWBFile using PyNWB. 
From fd082e72a2894b8ead22c1baeff429fb9b940d04 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 26 Jan 2024 00:20:28 -0800 Subject: [PATCH 07/11] Update requirements-dev.txt --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index a19b50bd3..48ffaa443 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,6 +8,7 @@ coverage==7.2.2 flake8==6.0.0 flake8-debugger==4.1.2 flake8-print==5.0.0 +hdmf-zarr==0.5.0 # for zarr tutorial testing isort==5.12.0 pytest==7.1.2 pytest-cov==4.0.0 From b0de61de74fa8c336b4fc463609af5ac6d94d682 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 26 Jan 2024 00:21:13 -0800 Subject: [PATCH 08/11] Discard changes to requirements-dev.txt --- requirements-dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 48ffaa443..a19b50bd3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,7 +8,6 @@ coverage==7.2.2 flake8==6.0.0 flake8-debugger==4.1.2 flake8-print==5.0.0 -hdmf-zarr==0.5.0 # for zarr tutorial testing isort==5.12.0 pytest==7.1.2 pytest-cov==4.0.0 From 46e46f03a4a6f2d07f4b20c0a7b6156f6edb3ea5 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 26 Jan 2024 00:21:47 -0800 Subject: [PATCH 09/11] Update requirements-doc.txt --- requirements-doc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-doc.txt b/requirements-doc.txt index 2050f4439..c37aee646 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -12,3 +12,4 @@ dataframe_image # used to render large dataframe as image in the sphinx galler lxml # used by dataframe_image when using the matplotlib backend hdf5plugin dandi>=0.46.6 +hdmf-zarr From 0105c55e754ebc8264c8e7e09c58936825e15d1c Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 26 Jan 2024 00:22:07 -0800 Subject: [PATCH 10/11] Update zarr_io.py --- docs/gallery/advanced_io/zarr_io.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py
index 58dddb79e..7226083a7 100644
--- a/docs/gallery/advanced_io/zarr_io.py
+++ b/docs/gallery/advanced_io/zarr_io.py
@@ -96,4 +96,3 @@
 #######################################################################################
 # .. note::
 #    For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`.
-

From 588c96bb0a6ac594e0a8e5f0fe4ebfceb732db27 Mon Sep 17 00:00:00 2001
From: Ryan Ly
Date: Fri, 26 Jan 2024 00:23:55 -0800
Subject: [PATCH 11/11] Update docs/gallery/advanced_io/zarr_io.py

---
 docs/gallery/advanced_io/zarr_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py
index 7226083a7..c6524b64d 100644
--- a/docs/gallery/advanced_io/zarr_io.py
+++ b/docs/gallery/advanced_io/zarr_io.py
@@ -14,7 +14,7 @@
 limitations on the number of files per directory (this affects local disk,
 GDrive, Dropbox etc.).
 
-Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr` package. First, create an
+Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr package<>`. First, create
 an NWBFile using PyNWB.
 """