From 6739cf32c798f9423363fd446c0387f2ad4b05c1 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 11:55:58 -0700 Subject: [PATCH 01/18] refactor streaming tutorial to expose Python code --- docs/gallery/advanced_io/streaming.py | 212 +++++++++++++------------- 1 file changed, 105 insertions(+), 107 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index b3800584a..bc90c96f2 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -23,113 +23,111 @@ Now you can get the url of a particular NWB file using the dandiset ID and the path of that file within the dandiset. -.. code-block:: python - - from dandi.dandiapi import DandiAPIClient - - dandiset_id = '000006' # ephys dataset from the Svoboda Lab - filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb' # 450 kB file - with DandiAPIClient() as client: - asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath) - s3_url = asset.get_content_url(follow_redirects=1, strip_query=True) - - -Streaming Method 1: fsspec --------------------------- -fsspec is another data streaming approach that is quite flexible and has several performance advantages. This library -creates a virtual filesystem for remote stores. With this approach, a virtual file is created for the file and -the virtual filesystem layer takes care of requesting data from the S3 bucket whenever data is -read from the virtual file. Note that this implementation is completely unaware of internals of the HDF5 format -and thus can work for **any** file, not only for the purpose of use with H5PY and PyNWB. - -First install ``fsspec`` and the dependencies of the :py:class:`~fsspec.implementations.http.HTTPFileSystem`: - -.. code-block:: bash - - pip install fsspec requests aiohttp - -Then in Python: - -.. 
code-block:: python - - import fsspec - import pynwb - import h5py - from fsspec.implementations.cached import CachingFileSystem - - # first, create a virtual filesystem based on the http protocol - fs=fsspec.filesystem("http") - - # create a cache to save downloaded data to disk (optional) - fs = CachingFileSystem( - fs=fs, - cache_storage="nwb-cache", # Local folder for the cache - ) - - # next, open the file - with fs.open(s3_url, "rb") as f: - with h5py.File(f) as file: - with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io: - nwbfile = io.read() - print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) - - -fsspec is a library that can be used to access a variety of different store formats, including (at the time of -writing): - -.. code-block:: python - - from fsspec.registry import known_implementations - known_implementations.keys() - -file, memory, dropbox, http, https, zip, tar, gcs, gs, gdrive, sftp, ssh, ftp, hdfs, arrow_hdfs, webhdfs, s3, s3a, wandb, oci, adl, abfs, az, cached, blockcache, filecache, simplecache, dask, dbfs, github, git, smb, jupyter, jlab, libarchive, reference - -The S3 backend, in particular, may provide additional functionality for accessing data on DANDI. See the -`fsspec documentation on known implementations `_ -for a full updated list of supported store formats. - -Streaming Method 2: ROS3 ------------------------- -ROS3 is one of the supported methods for reading data from a remote store. ROS3 stands for "read only S3" and is a -driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored remotely in s3 buckets. Using this method -requires that your HDF5 library is installed with the ROS3 driver enabled. This is not the default configuration, -so you will need to make sure you install the right version of ``h5py`` that has this advanced configuration enabled. -You can install HDF5 with the ROS3 driver from `conda-forge `_ using ``conda``. 
You may -first need to uninstall a currently installed version of ``h5py``. - -.. code-block:: bash - - pip uninstall h5py - conda install -c conda-forge "h5py>=3.2" - -Now instantiate a :py:class:`~pynwb.NWBHDF5IO` object with the S3 URL and specify the driver as "ros3". This -will download metadata about the file from the S3 bucket to memory. The values of datasets are accessed lazily, -just like when reading an NWB file stored locally. So, slicing into a dataset will require additional time to -download the sliced data (and only the sliced data) to memory. - -.. code-block:: python - - from pynwb import NWBHDF5IO - - with NWBHDF5IO(s3_url, mode='r', load_namespaces=True, driver='ros3') as io: - nwbfile = io.read() - print(nwbfile) - print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) - -Which streaming method to choose? ---------------------------------- - -fsspec has many advantages over ros3: - -1. fsspec is easier to install -2. fsspec supports caching, which will dramatically speed up repeated requests for the - same region of data -3. fsspec automatically retries when s3 fails to return. -4. fsspec works with other storage backends and -5. fsspec works with other types of files. -6. In our hands, fsspec is faster out-of-the-box. - -For these reasons, we would recommend use fsspec for most Python users. 
""" # sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnails_streaming.png' + +from dandi.dandiapi import DandiAPIClient + +dandiset_id = '000006' # ephys dataset from the Svoboda Lab +filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb' # 450 kB file +with DandiAPIClient() as client: + asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath) + s3_url = asset.get_content_url(follow_redirects=1, strip_query=True) + +############################################## +# Streaming Method 1: fsspec +# -------------------------- +# fsspec is another data streaming approach that is quite flexible and has several performance advantages. This library +# creates a virtual filesystem for remote stores. With this approach, a virtual file is created for the file and +# the virtual filesystem layer takes care of requesting data from the S3 bucket whenever data is +# read from the virtual file. Note that this implementation is completely unaware of internals of the HDF5 format +# and thus can work for **any** file, not only for the purpose of use with H5PY and PyNWB. +# +# First install ``fsspec`` and the dependencies of the :py:class:`~fsspec.implementations.http.HTTPFileSystem`: +# +# .. 
code-block:: bash +# +# pip install fsspec requests aiohttp +# +# Then in Python: + +import fsspec +import pynwb +import h5py +from fsspec.implementations.cached import CachingFileSystem + +# first, create a virtual filesystem based on the http protocol +fs=fsspec.filesystem("http") + +# create a cache to save downloaded data to disk (optional) +fs = CachingFileSystem( + fs=fs, + cache_storage="nwb-cache", # Local folder for the cache +) + +# next, open the file +with fs.open(s3_url, "rb") as f: + with h5py.File(f) as file: + with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io: + nwbfile = io.read() + print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) + +################################## +# fsspec is a library that can be used to access a variety of different store formats, including (at the time of +# writing): +# +# .. code-block:: python +# +# from fsspec.registry import known_implementations +# known_implementations.keys() +# +# file, memory, dropbox, http, https, zip, tar, gcs, gs, gdrive, sftp, ssh, ftp, hdfs, arrow_hdfs, webhdfs, s3, s3a, +# wandb, oci, adl, abfs, az, cached, blockcache, filecache, simplecache, dask, dbfs, github, git, smb, jupyter, jlab, +# libarchive, reference +# +# The S3 backend, in particular, may provide additional functionality for accessing data on DANDI. See the +# `fsspec documentation on known implementations `_ +# for a full updated list of supported store formats. +# +# Streaming Method 2: ROS3 +# ------------------------ +# ROS3 is one of the supported methods for reading data from a remote store. ROS3 stands for "read only S3" and is a +# driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored remotely in s3 buckets. Using this method +# requires that your HDF5 library is installed with the ROS3 driver enabled. This is not the default configuration, +# so you will need to make sure you install the right version of ``h5py`` that has this advanced configuration enabled. 
+# You can install HDF5 with the ROS3 driver from `conda-forge `_ using ``conda``. You may +# first need to uninstall a currently installed version of ``h5py``. +# +# .. code-block:: bash +# +# pip uninstall h5py +# conda install -c conda-forge "h5py>=3.2" +# +# Now instantiate a :py:class:`~pynwb.NWBHDF5IO` object with the S3 URL and specify the driver as "ros3". This +# will download metadata about the file from the S3 bucket to memory. The values of datasets are accessed lazily, +# just like when reading an NWB file stored locally. So, slicing into a dataset will require additional time to +# download the sliced data (and only the sliced data) to memory. + +from pynwb import NWBHDF5IO + +with NWBHDF5IO(s3_url, mode='r', load_namespaces=True, driver='ros3') as io: + nwbfile = io.read() + print(nwbfile) + print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) + +################################################## +# Which streaming method to choose? +# --------------------------------- +# +# fsspec has many advantages over ros3: +# +# 1. fsspec is easier to install +# 2. fsspec supports caching, which will dramatically speed up repeated requests for the +# same region of data +# 3. fsspec automatically retries when s3 fails to return. +# 4. fsspec works with other storage backends and +# 5. fsspec works with other types of files. +# 6. In our hands, fsspec is faster out-of-the-box. +# +# For these reasons, we would recommend use fsspec for most Python users. 
From 12ad86fe900840a06a33ab9185286778f85a9020 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 12:09:55 -0700 Subject: [PATCH 02/18] add fsspec to docs requirements --- requirements-doc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-doc.txt b/requirements-doc.txt index 83fd3c82f..af462b93c 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -12,4 +12,5 @@ dataframe_image # used to render large dataframe as image in the sphinx galler lxml # used by dataframe_image when using the matplotlib backend hdf5plugin dandi>=0.46.6 +fsspec>=2023.6.0 From 64e05d6e2bf47309a30047298d19dcd1723fbb74 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 13:52:53 -0700 Subject: [PATCH 03/18] move fsspec to requirements-dev.txt --- requirements-dev.txt | 1 + requirements-doc.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index a19b50bd3..f2bcd4fee 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,3 +12,4 @@ isort==5.12.0 pytest==7.1.2 pytest-cov==4.0.0 tox==4.4.8 +fsspec==2023.6.0 diff --git a/requirements-doc.txt b/requirements-doc.txt index af462b93c..83fd3c82f 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -12,5 +12,4 @@ dataframe_image # used to render large dataframe as image in the sphinx galler lxml # used by dataframe_image when using the matplotlib backend hdf5plugin dandi>=0.46.6 -fsspec>=2023.6.0 From 50ffa5873c1ba1ccd1cde8d273bab4c039798d57 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 14:12:24 -0700 Subject: [PATCH 04/18] move fsspec to environment-ros3.yml --- environment-ros3.yml | 1 + requirements-dev.txt | 1 - requirements-doc.txt | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/environment-ros3.yml b/environment-ros3.yml index 3244a32d2..3d3b966c0 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -13,3 +13,4 @@ dependencies: - python-dateutil==2.8.2 - 
setuptools - dandi==0.55.1 # NOTE: dandi does not support osx-arm64 + - fsspec==2023.6.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index f2bcd4fee..a19b50bd3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,4 +12,3 @@ isort==5.12.0 pytest==7.1.2 pytest-cov==4.0.0 tox==4.4.8 -fsspec==2023.6.0 diff --git a/requirements-doc.txt b/requirements-doc.txt index 83fd3c82f..2050f4439 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -12,4 +12,3 @@ dataframe_image # used to render large dataframe as image in the sphinx galler lxml # used by dataframe_image when using the matplotlib backend hdf5plugin dandi>=0.46.6 - From 793526da93c98f28c347cfcab651b626158a2834 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 14:14:54 -0700 Subject: [PATCH 05/18] flake8 --- docs/gallery/advanced_io/streaming.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index bc90c96f2..33d41c355 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -32,8 +32,8 @@ dandiset_id = '000006' # ephys dataset from the Svoboda Lab filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb' # 450 kB file with DandiAPIClient() as client: - asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath) - s3_url = asset.get_content_url(follow_redirects=1, strip_query=True) + asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath) + s3_url = asset.get_content_url(follow_redirects=1, strip_query=True) ############################################## # Streaming Method 1: fsspec @@ -58,7 +58,7 @@ from fsspec.implementations.cached import CachingFileSystem # first, create a virtual filesystem based on the http protocol -fs=fsspec.filesystem("http") +fs = fsspec.filesystem("http") # create a cache to save downloaded data to disk (optional) fs = CachingFileSystem( @@ -112,9 
+112,9 @@ from pynwb import NWBHDF5IO with NWBHDF5IO(s3_url, mode='r', load_namespaces=True, driver='ros3') as io: - nwbfile = io.read() - print(nwbfile) - print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) + nwbfile = io.read() + print(nwbfile) + print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) ################################################## # Which streaming method to choose? From d82443169502d799971f155c8d07b610d4cb0b5b Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 14:37:11 -0700 Subject: [PATCH 06/18] add required dependencies --- environment-ros3.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment-ros3.yml b/environment-ros3.yml index 3d3b966c0..cf7bdafea 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -14,3 +14,5 @@ dependencies: - setuptools - dandi==0.55.1 # NOTE: dandi does not support osx-arm64 - fsspec==2023.6.0 + - requests==2.28.1 + - aiohttp==3.8.3 \ No newline at end of file From 4cad986548f112d8e935fd025eb483134a670205 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 15:06:12 -0700 Subject: [PATCH 07/18] add remfile to streaming.py tutorial --- docs/gallery/advanced_io/streaming.py | 24 ++++++++++++++++++++++++ environment-ros3.yml | 3 ++- src/pynwb/__init__.py | 3 ++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 33d41c355..2611c960b 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -117,6 +117,30 @@ print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]) ################################################## +# Method 3: remfile +# ----------------- +# remfile is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and allows for +# caching of data in the local filesystem. 
The one caveat of remfile is that it is a very new project that has not +# been tested in a variety of use-cases. You can install remfile with pip: +# +# .. code-block:: bash +# +# pip install remfile +# +# Then remfile can be used similarly to fsspec + +import h5py +from pynwb import NWBHDF5IO +import remfile + +file = remfile.File(s3_url) + +with h5py.File(file, "r") as f: + with NWBHDF5IO(file=file, load_namespaces=True) as io: + nwbfile = io.read() + print(nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]) + +################################################ # Which streaming method to choose? # --------------------------------- # diff --git a/environment-ros3.yml b/environment-ros3.yml index cf7bdafea..a7e83367e 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -15,4 +15,5 @@ dependencies: - dandi==0.55.1 # NOTE: dandi does not support osx-arm64 - fsspec==2023.6.0 - requests==2.28.1 - - aiohttp==3.8.3 \ No newline at end of file + - aiohttp==3.8.3 + - remfile==0.1.7 \ No newline at end of file diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 181079970..fc3aced3b 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -212,7 +212,8 @@ class NWBHDF5IO(_HDF5IO): {'name': 'extensions', 'type': (str, TypeMap, list), 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', 'default': None}, - {'name': 'file', 'type': [h5py.File, 'S3File'], 'doc': 'a pre-existing h5py.File object', 'default': None}, + {'name': 'file', 'type': [h5py.File, 'S3File', "RemFile"], 'doc': 'a pre-existing h5py.File object', + 'default': None}, {'name': 'comm', 'type': "Intracomm", 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, From 504581559f53f75f41031d4852bb34f0fc01da36 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 15:51:22 -0700 Subject: 
[PATCH 08/18] adjust language --- docs/gallery/advanced_io/streaming.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 2611c960b..2012ba599 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -120,8 +120,8 @@ # Method 3: remfile # ----------------- # remfile is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and allows for -# caching of data in the local filesystem. The one caveat of remfile is that it is a very new project that has not -# been tested in a variety of use-cases. You can install remfile with pip: +# caching of data in the local filesystem. The one caveat of remfile is that it is a very new project that has not been +# tested in a variety of use-cases. You can install remfile with pip: # # .. code-block:: bash # @@ -143,15 +143,14 @@ ################################################ # Which streaming method to choose? # --------------------------------- -# -# fsspec has many advantages over ros3: -# -# 1. fsspec is easier to install -# 2. fsspec supports caching, which will dramatically speed up repeated requests for the -# same region of data -# 3. fsspec automatically retries when s3 fails to return. -# 4. fsspec works with other storage backends and -# 5. fsspec works with other types of files. -# 6. In our hands, fsspec is faster out-of-the-box. +# When choosing a streaming method, consider these comparisons: +# +# 1. fsspec and remfile are easier to install than ros3, requiring a simple pip-install that works across platforms. +# 2. fsspec and remfile support caching, which will dramatically speed up repeated requests for the same region of data. +# 3. fsspec and remfile automatically retry when s3 fails to return due to brief connection errors. +# 4. fsspec works with other storage backends. remfile and ros3 only work with s3. 
+# 5. fsspec and remfile work with other types of files. ros3 is a driver designed specifically for HDF5 files. +# 6. In our hands, remfile is fastest, then fsspec, then ros3, though this could vary by file and access pattern. +# 7. fsspec and ros3 are well-supported and popular community tools. remfile is much newer and not as well adopted. # # For these reasons, we would recommend use fsspec for most Python users. From bab2f9e0d34bb9552b7d36b893acd025119f6c33 Mon Sep 17 00:00:00 2001 From: bendichter Date: Thu, 17 Aug 2023 17:14:33 -0700 Subject: [PATCH 09/18] move remfile to pip install --- environment-ros3.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environment-ros3.yml b/environment-ros3.yml index a7e83367e..a1ab83ff4 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -16,4 +16,6 @@ dependencies: - fsspec==2023.6.0 - requests==2.28.1 - aiohttp==3.8.3 - - remfile==0.1.7 \ No newline at end of file + - pip + - pip: + - remfile==0.1.7 \ No newline at end of file From 65c34aa68cd3215739aae47fae26f69bd34a825b Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Sun, 22 Oct 2023 11:19:39 -0700 Subject: [PATCH 10/18] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a3a79232..a5c7c150c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Enhancements and minor changes - Add `NWBHDF5IO.can_read()`. @bendichter [#1703](https://github.com/NeurodataWithoutBorders/pynwb/pull/1703) - Add `pynwb.get_nwbfile_version()`. @bendichter [#1703](https://github.com/NeurodataWithoutBorders/pynwb/pull/1703) +- Add support for `RemFile` in `NWBHDF5IO`. 
@bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) ## PyNWB 2.5.0 (August 18, 2023) From 31c5f5f5d7e81d582f8fcafae053921c2ccebe1d Mon Sep 17 00:00:00 2001 From: bendichter Date: Mon, 27 Nov 2023 10:26:05 -0500 Subject: [PATCH 11/18] * change tutorial to use correct h5py.File object * rmv addition of RemFile as an allowed type for NWBHDF5IO --- docs/gallery/advanced_io/streaming.py | 6 +++--- src/pynwb/__init__.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 48c6d0199..5e6ba6906 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -139,10 +139,10 @@ from pynwb import NWBHDF5IO import remfile -file = remfile.File(s3_url) +rem_file = remfile.File(s3_url) -with h5py.File(file, "r") as f: - with NWBHDF5IO(file=file, load_namespaces=True) as io: +with h5py.File(rem_file, "r") as h5py_file: + with NWBHDF5IO(file=h5py_file, load_namespaces=True) as io: nwbfile = io.read() print(nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]) diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 98205fea0..53d1385aa 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -250,7 +250,7 @@ def can_read(path: str): {'name': 'extensions', 'type': (str, TypeMap, list), 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', 'default': None}, - {'name': 'file', 'type': [h5py.File, 'S3File', "RemFile"], 'doc': 'a pre-existing h5py.File object', + {'name': 'file', 'type': [h5py.File, 'S3File'], 'doc': 'a pre-existing h5py.File object', 'default': None}, {'name': 'comm', 'type': "Intracomm", 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, From 3a03c49f292d715879a6bae43e5ddc64586e8d63 Mon Sep 17 00:00:00 2001 From: bendichter Date: Mon, 27 Nov 2023 10:36:22 -0500 Subject: [PATCH 12/18] update CHANGELOG.md --- 
CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5c7c150c..7483693a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,9 @@ ### Enhancements and minor changes - Add `NWBHDF5IO.can_read()`. @bendichter [#1703](https://github.com/NeurodataWithoutBorders/pynwb/pull/1703) - Add `pynwb.get_nwbfile_version()`. @bendichter [#1703](https://github.com/NeurodataWithoutBorders/pynwb/pull/1703) -- Add support for `RemFile` in `NWBHDF5IO`. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) + +### Documentation and tutorial enhancements +- Add RemFile to streaming tutorial @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) ## PyNWB 2.5.0 (August 18, 2023) From 77331a291f9d9d46aef86ca3ce4f4cb3cd9b433f Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 27 Nov 2023 10:37:17 -0500 Subject: [PATCH 13/18] Update src/pynwb/__init__.py --- src/pynwb/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 53d1385aa..710f55ee8 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -250,8 +250,7 @@ def can_read(path: str): {'name': 'extensions', 'type': (str, TypeMap, list), 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', 'default': None}, - {'name': 'file', 'type': [h5py.File, 'S3File'], 'doc': 'a pre-existing h5py.File object', - 'default': None}, + {'name': 'file', 'type': [h5py.File, 'S3File'], 'doc': 'a pre-existing h5py.File object', 'default': None}, {'name': 'comm', 'type': "Intracomm", 'doc': 'the MPI communicator to use for parallel I/O', 'default': None}, {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, From ea722839b31d2cfd16df126b5956de9c8fcb2e8f Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 27 Nov 2023 13:39:25 -0500 Subject: [PATCH 14/18] 
Update environment-ros3.yml Co-authored-by: Ryan Ly --- environment-ros3.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-ros3.yml b/environment-ros3.yml index c27b10ae9..633dd88cb 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -18,4 +18,4 @@ dependencies: - aiohttp==3.8.3 - pip - pip: - - remfile==0.1.7 + - remfile==0.1.9 From 59fe561235a72b14535748555facb4dc68c4903e Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 27 Nov 2023 13:41:39 -0500 Subject: [PATCH 15/18] Update docs/gallery/advanced_io/streaming.py Co-authored-by: Ryan Ly --- docs/gallery/advanced_io/streaming.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 5e6ba6906..bcf18a122 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -126,8 +126,8 @@ # Method 3: remfile # ----------------- # remfile is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and allows for -# caching of data in the local filesystem. The one caveat of remfile is that it is a very new project that has not been -# tested in a variety of use-cases. You can install remfile with pip: +# caching of data in the local filesystem. The caveats of remfile are that it is a very new project that has not been +# tested in a variety of use-cases and caching options are limited compared to fsspec. You can install remfile with pip: # # .. 
code-block:: bash # From d34a526a3049abc9d13d6b37616f1ff36b3e2ba5 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 27 Nov 2023 13:41:51 -0500 Subject: [PATCH 16/18] Update docs/gallery/advanced_io/streaming.py Co-authored-by: Ryan Ly --- docs/gallery/advanced_io/streaming.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index bcf18a122..4cacb184f 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -133,7 +133,6 @@ # # pip install remfile # -# Then remfile can be used similarly to fsspec import h5py from pynwb import NWBHDF5IO From c16736bccbae65c895a09de3dff40e90187d1d08 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Sat, 13 Jan 2024 00:25:31 -0800 Subject: [PATCH 17/18] Update streaming.py --- docs/gallery/advanced_io/streaming.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 73d0e45ff..e0142cc81 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -90,6 +90,9 @@ # `fsspec documentation on known implementations `_ # for a full updated list of supported store formats. # +# One downside of this fsspec method is that fsspec is not optimized for reading HDF5 files, and so streaming data +# using this method can be slow. A faster alternative is ``remfile`` described below. +# # Streaming Method 2: ROS3 # ------------------------ # ROS3 stands for "read only S3" and is a driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored @@ -120,14 +123,18 @@ # # pip uninstall h5py # conda install -c conda-forge "h5py>=3.2" +# +# Besides the extra burden of installing h5py from a non-PyPI source, one downside of this ROS3 method is that +# this method does not support automatic retries in case the connection fails. 
################################################## # Method 3: remfile # ----------------- -# remfile is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and allows for -# caching of data in the local filesystem. The caveats of remfile are that it is a very new project that has not been -# tested in a variety of use-cases and caching options are limited compared to fsspec. You can install remfile with pip: +# ``remfile`` is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and +# allows for caching of data in the local filesystem. The caveats of ``remfile`` are that it is a very new project +# that has not been tested in a variety of use-cases and caching options are limited compared to ``fsspec``. +# You can install ``remfile`` with pip: # # .. code-block:: bash # @@ -150,8 +157,9 @@ # --------------------------------- # # From a user perspective, once opened, the :py:class:`~pynwb.file.NWBFile` works the same with -# both fsspec and ros3. However, in general, we currently recommend using fsspec for streaming -# NWB files because it is more performant and reliable than ros3. In particular fsspec: +# fsspec, ros3, or remfile. However, in general, we currently recommend using fsspec for streaming +# NWB files because it is more performant and reliable than ros3 and more widely tested than remfile. +# In particular, fsspec: # # 1. 
supports caching, which will dramatically speed up repeated requests for the # same region of data, From 226f41dc3cfe4d7509ec4b3b652fda509a33d849 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Sat, 13 Jan 2024 00:26:32 -0800 Subject: [PATCH 18/18] Update streaming.py --- docs/gallery/advanced_io/streaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index e0142cc81..101400c2d 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -125,7 +125,7 @@ # conda install -c conda-forge "h5py>=3.2" # # Besides the extra burden of installing h5py from a non-PyPI source, one downside of this ROS3 method is that -# this method does not support automatic retries in case the connection fails. +# this method does not support automatic retries in case the connection fails. ##################################################