diff --git a/.gitignore b/.gitignore
index 1767f2597..0c81cb605 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,4 +81,4 @@
 _version.py
 .core_typemap_version
 core_typemap.pkl
-venv/
+venv
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2230cbd49..5ed9ddac7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,11 +11,13 @@
 - Deprecated `EventWaveform` neurodata type. @rly [#1940](https://github.com/NeurodataWithoutBorders/pynwb/pull/1940)
 - Deprecated `ImageMaskSeries` neurodata type. @rly [#1941](https://github.com/NeurodataWithoutBorders/pynwb/pull/1941)
 - Removed python 3.8 support, added python 3.13 support. @stephprince [#2007](https://github.com/NeurodataWithoutBorders/pynwb/pull/2007)
+- Fixed `mock_ElectricalSeries` so that the number of electrodes in the electrode table region agrees with explicitly passed data. @h-mayorquin [#2019](https://github.com/NeurodataWithoutBorders/pynwb/pull/2019)
 
 ### Documentation and tutorial enhancements
 - Updated `SpikeEventSeries`, `DecompositionSeries`, and `FilteredEphys` examples. @stephprince [#2012](https://github.com/NeurodataWithoutBorders/pynwb/pull/2012)
 - Replaced deprecated `scipy.misc.face` dataset in the images tutorial with another example. @stephprince [#2016](https://github.com/NeurodataWithoutBorders/pynwb/pull/2016)
-
+- Removed the Allen Brain Observatory example, which was unnecessary and difficult to maintain. @rly [#2026](https://github.com/NeurodataWithoutBorders/pynwb/pull/2026)
+
 ## PyNWB 2.8.3 (November 19, 2024)
 
 ### Enhancements and minor changes
diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py
index 4bdc992b8..428ebe300 100644
--- a/docs/gallery/advanced_io/streaming.py
+++ b/docs/gallery/advanced_io/streaming.py
@@ -6,9 +6,8 @@
 You can read specific sections within individual data files directly from remote stores such as the
 `DANDI Archive `_. This is especially useful for reading small pieces of data
-from a large NWB file stored
-remotely. First, you will need to get the location of the file. The code below illustrates how to do this on DANDI
-using the dandi API library.
+from a large NWB file stored remotely. First, you will need to get the location of the file. The code
+below illustrates how to do this on DANDI using the dandi API library.
 
 Getting the location of the file on DANDI
 -----------------------------------------
@@ -41,13 +40,68 @@
 s3_url = asset.get_content_url(follow_redirects=1, strip_query=True)
 
 ##############################################
-# Streaming Method 1: fsspec
-# --------------------------
-# fsspec is another data streaming approach that is quite flexible and has several performance advantages. This library
-# creates a virtual filesystem for remote stores. With this approach, a virtual file is created for the file and
-# the virtual filesystem layer takes care of requesting data from the S3 bucket whenever data is
-# read from the virtual file. Note that this implementation is completely unaware of internals of the HDF5 format
-# and thus can work for **any** file, not only for the purpose of use with H5PY and PyNWB.
+# Once you have an S3 URL, you can use it to read the NWB file directly from the remote store. There are several
+# ways to do this, including using the ``remfile`` library, the ``fsspec`` library, or the ROS3 driver in h5py.
+#
+# Streaming data with ``remfile``
+# -------------------------------
+# ``remfile`` is a library that enables indexing and streaming of files in S3, optimized for reading HDF5 files.
+# remfile is simple and fast, especially for the initial load of the NWB file and for accessing small pieces of data.
+# It is a lightweight dependency with a very small codebase. Although ``remfile`` is a very new project that has not
+# been tested in a wide variety of use cases, it has worked well in our hands.
+#
+# You can install ``remfile`` with pip:
+#
+# .. code-block:: bash
+#
+#    pip install remfile
+#
+# Then use it in Python:
+
+import h5py
+from pynwb import NWBHDF5IO
+import remfile
+
+# Create a disk cache to store downloaded data (optional)
+cache_dirname = '/tmp/remfile_cache'
+disk_cache = remfile.DiskCache(cache_dirname)
+
+# open the file
+rem_file = remfile.File(s3_url, disk_cache=disk_cache)
+h5py_file = h5py.File(rem_file, "r")
+io = NWBHDF5IO(file=h5py_file)
+nwbfile = io.read()
+
+# now you can access the data
+streamed_data = nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]
+
+# close the file
+io.close()
+h5py_file.close()
+rem_file.close()
+
+##################################
+# You can also use a context manager to open the file. This will automatically close the file when the context is exited.
+# This approach can be a bit cumbersome when exploring files interactively, but it is the preferred approach once
+# the program is finalized, because it will ensure that the file is closed properly even if an exception is raised.
+
+rem_file = remfile.File(s3_url, disk_cache=disk_cache)
+with h5py.File(rem_file, "r") as h5py_file:
+    with NWBHDF5IO(file=h5py_file, load_namespaces=True) as io:
+        nwbfile = io.read()
+        streamed_data = nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]
+
+# After the contexts end, the file is closed, so you cannot download new data from the file.
+
+#################################
+# Streaming data with ``fsspec``
+# ------------------------------
+# ``fsspec`` is a data streaming approach that is quite flexible. This library creates a virtual filesystem for remote
+# stores. With this approach, a virtual file is created for the file and the virtual filesystem layer takes care of
+# requesting data from the S3 bucket whenever data is read from the virtual file. Note that this implementation is
+# completely unaware of internals of the HDF5 format and thus can work for **any** file, not only for the purpose of
+# use with ``h5py`` and PyNWB. ``fsspec`` can also be used to access data from other storage backends, such as Google
+# Drive or Dropbox.
 #
 # First install ``fsspec`` and the dependencies of the :py:class:`~fsspec.implementations.http.HTTPFileSystem`:
 #
@@ -71,7 +125,23 @@
     cache_storage="nwb-cache",  # Local folder for the cache
 )
 
-# next, open the file
+# open the file
+f = fs.open(s3_url, "rb")
+file = h5py.File(f)
+io = pynwb.NWBHDF5IO(file=file)
+nwbfile = io.read()
+
+# now you can access the data
+streamed_data = nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]
+
+# close the file
+io.close()
+file.close()
+f.close()
+
+##################################
+# You can also use context managers to open the file. This will automatically close the file when the context is exited.
+
 with fs.open(s3_url, "rb") as f:
     with h5py.File(f) as file:
         with pynwb.NWBHDF5IO(file=file) as io:
@@ -79,101 +149,60 @@
             print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])
 
 ##################################
-# fsspec is a library that can be used to access a variety of different store formats, including (at the time of
-# writing):
+# fsspec can be used to access a variety of different stores, including (at the time of writing):
 #
 # .. code-block:: python
 #
 #    from fsspec.registry import known_implementations
 #    known_implementations.keys()
 #
-# file, memory, dropbox, http, https, zip, tar, gcs, gs, gdrive, sftp, ssh, ftp, hdfs, arrow_hdfs, webhdfs, s3, s3a,
-# wandb, oci, adl, abfs, az, cached, blockcache, filecache, simplecache, dask, dbfs, github, git, smb, jupyter, jlab,
-# libarchive, reference
+# abfs, adl, arrow_hdfs, asynclocal, az, blockcache, box, cached, dask, data, dbfs, dir, dropbox, dvc,
+# file, filecache, ftp, gcs, gdrive, generic, git, github, gs, hdfs, hf, http, https, jlab, jupyter,
+# lakefs, libarchive, local, memory, oci, ocilake, oss, reference, root, s3, s3a, sftp, simplecache,
+# smb, ssh, tar, wandb, webdav, webhdfs, zip
 #
 # The S3 backend, in particular, may provide additional functionality for accessing data on DANDI. See the
-# `fsspec documentation on known implementations `_
+# `fsspec documentation on known implementations
+# `_
 # for a full, updated list of supported store formats.
 #
-# One downside of this fsspec method is that fsspec is not optimized for reading HDF5 files, and so streaming data
-# using this method can be slow. A faster alternative is ``remfile`` described below.
+# One downside of the fsspec method is that fsspec is not optimized for reading HDF5 files, and so streaming data
+# using this method can be slow. ``remfile`` may be a faster alternative.
 #
-# Streaming Method 2: ROS3
+# Streaming data with ROS3
 # ------------------------
 # ROS3 stands for "read only S3" and is a driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored
 # remotely in s3 buckets. Using this method requires that your HDF5 library is installed with the ROS3 driver enabled.
 # With ROS3 support enabled in h5py, we can instantiate a :py:class:`~pynwb.NWBHDF5IO` object with the S3 URL and
-# specify the driver as "ros3".
+# specify the driver as "ros3". Like the other methods, you can either use a context manager to open and close the
+# file, or open and close it manually.
 
 from pynwb import NWBHDF5IO
 
+# open with context manager
 with NWBHDF5IO(s3_url, mode='r', driver='ros3') as io:
     nwbfile = io.read()
-    print(nwbfile)
-    print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])
+    streamed_data = nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]
+
+# open and close manually
+io = NWBHDF5IO(s3_url, mode='r', driver='ros3')
+nwbfile = io.read()
+streamed_data = nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:]
+io.close()
 
 ##################################
 # This will download metadata about the file from the S3 bucket to memory. The values of datasets are accessed lazily,
-# just like when reading an NWB file stored locally. So, slicing into a dataset will require additional time to
-# download the sliced data (and only the sliced data) to memory.
+# just like when reading an NWB file stored locally. So, slicing into a dataset will download the sliced data (and
+# only the sliced data) and load it directly to memory.
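+#
+# As a rough sketch of this lazy behavior (reusing the same file and dataset names as above, and assuming an
+# ROS3-enabled h5py as discussed in the note below), reading a small slice transfers only those values:
+
+with NWBHDF5IO(s3_url, mode='r', driver='ros3') as io:
+    nwbfile = io.read()
+    lick_ts = nwbfile.acquisition['lick_times'].time_series['lick_left_times']
+    n_samples = lick_ts.data.shape[0]  # the shape comes from the already-downloaded metadata
+    first_ten = lick_ts.data[:10]      # downloads and loads only these ten values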
 #
 # .. note::
 #
-#    Pre-built h5py packages on PyPI do not include this S3 support. If you want this feature, you could use packages
-#    from conda-forge, or build h5py from source against an HDF5 build with S3 support. You can install HDF5 with
-#    the ROS3 driver from `conda-forge `_ using ``conda``. You may
-#    first need to uninstall a currently installed version of ``h5py``.
+#    Pre-built h5py packages on PyPI do not include this S3 support. If you want this feature, we recommend installing
+#    ``h5py`` using conda:
 #
 #    .. code-block:: bash
 #
 #       pip uninstall h5py
-#       conda install -c conda-forge "h5py>=3.2"
-#
-# Besides the extra burden of installing h5py from a non-PyPI source, one downside of this ROS3 method is that
-# this method does not support automatic retries in case the connection fails.
-
-
-##################################################
-# Method 3: remfile
-# -----------------
-# ``remfile`` is another library that enables indexing and streaming of files in s3. remfile is simple and fast,
-# especially for the initial load of the nwb file and for accessing small pieces of data. The caveats of ``remfile``
-# are that it is a very new project that has not been tested in a variety of use-cases and caching options are
-# limited compared to ``fsspec``. `remfile` is a simple, lightweight dependency with a very small codebase.
-#
-# You can install ``remfile`` with pip:
-#
-# .. code-block:: bash
+#       conda install h5py
 #
-#    pip install remfile
-#
-
-import h5py
-from pynwb import NWBHDF5IO
-import remfile
-
-rem_file = remfile.File(s3_url)
-
-with h5py.File(rem_file, "r") as h5py_file:
-    with NWBHDF5IO(file=h5py_file, load_namespaces=True) as io:
-        nwbfile = io.read()
-        print(nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:])
-
-##################################################
-# Which streaming method to choose?
-# ---------------------------------
-#
-# From a user perspective, once opened, the :py:class:`~pynwb.file.NWBFile` works the same with
-# fsspec, ros3, or remfile. However, in general, we currently recommend using fsspec for streaming
-# NWB files because it is more performant and reliable than ros3 and more widely tested than remfile.
-# However, if you are experiencing long wait times for the initial file load on your network, you
-# may want to try remfile.
-#
-# Advantages of fsspec include:
-#
-# 1. supports caching, which will dramatically speed up repeated requests for the
-#    same region of data,
-# 2. automatically retries when s3 fails to return, which helps avoid errors when accessing data due to
-#    intermittent errors in connections with S3 (remfile does this as well),
-# 3. works also with other storage backends (e.g., GoogleDrive or Dropbox, not just S3) and file formats, and
-# 4. in our experience appears to provide faster out-of-the-box performance than the ros3 driver.
+# Alternatively, you can build h5py from source against an HDF5 build with S3 support, but this is more complicated.
diff --git a/docs/gallery/domain/brain_observatory.py b/docs/gallery/domain/brain_observatory.py
deleted file mode 100644
index 2e1a9d6a3..000000000
--- a/docs/gallery/domain/brain_observatory.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""
-Allen Brain Observatory
-=================================
-
-Create an nwb file from Allen Brain Observatory data.
-"""
-
-########################################
-# This example demonstrates the basic functionality of several parts of the pynwb write API, centered around the optical
-# physiology submodule (pynwb.ophys). We will use the allensdk as a read API, while leveraging the pynwb data model and
-# write api to transform and write the data back to disk.
-#
-# .. note: Using the latest allensdk package requires Python 3.6 or higher.
-
-########################################
-# .. raw:: html
-#    :url: https://gist.githubusercontent.com/nicain/82e6b3d8f9ff5b85ef01a582e41e2389/raw/
-
-# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnails_allenbrainobservatory.png'
-
-import allensdk.brain_observatory.stimulus_info as si
-from allensdk.core.brain_observatory_cache import BrainObservatoryCache
-
-from pynwb import NWBHDF5IO, NWBFile, TimeSeries
-from pynwb.device import Device
-from pynwb.image import ImageSeries, IndexSeries
-from pynwb.ophys import DfOverF, ImageSegmentation, OpticalChannel
-
-# Settings:
-ophys_experiment_id = 562095852
-save_file_name = "brain_observatory.nwb"
-
-########################################
-# Let's begin by downloading an Allen Institute Brain Observatory file. After we cache this file locally (approx. 450
-# MB), we can open data assets we wish to write into our NWB:N file. These include stimulus, acquisition, and
-# processing data, as well as time "epochs" (intervals of interest)".
-boc = BrainObservatoryCache(manifest_file="manifest.json")
-dataset = boc.get_ophys_experiment_data(ophys_experiment_id)
-metadata = dataset.get_metadata()
-cell_specimen_ids = dataset.get_cell_specimen_ids()
-timestamps, dFF = dataset.get_dff_traces()
-stimulus_list = [
-    s for s in si.SESSION_STIMULUS_MAP[metadata["session_type"]] if s != "spontaneous"
-]
-running_data, _ = dataset.get_running_speed()
-trial_table = dataset.get_stimulus_table("master")
-trial_table["start"] = timestamps[trial_table["start"].values]
-trial_table["end"] = timestamps[trial_table["end"].values]
-epoch_table = dataset.get_stimulus_epoch_table()
-epoch_table["start"] = timestamps[epoch_table["start"].values]
-epoch_table["end"] = timestamps[epoch_table["end"].values]
-
-########################################
-# 1) First, lets create a top-level "file" container object. All the other NWB:N data components will be stored
-# hierarchically, relative to this container. The data won't actually be written to the file system until the end of
-# the script.
-
-nwbfile = NWBFile(
-    session_description="Allen Brain Observatory dataset",
-    identifier=str(metadata["ophys_experiment_id"]),
-    session_start_time=metadata["session_start_time"],
-)
-
-
-########################################
-# 2) Next, we add stimuli templates (one for each type of stimulus), and a data series that indexes these templates to
-# describe what stimulus was being shown during the experiment.
-for stimulus in stimulus_list:
-    visual_stimulus_images = ImageSeries(
-        name=stimulus,
-        data=dataset.get_stimulus_template(stimulus),
-        unit="NA",
-        format="raw",
-        timestamps=[0.0],
-    )
-    image_index = IndexSeries(
-        name=stimulus,
-        data=dataset.get_stimulus_table(stimulus).frame.values,
-        unit="NA",
-        indexed_timeseries=visual_stimulus_images,
-        timestamps=timestamps[dataset.get_stimulus_table(stimulus).start.values],
-    )
-    nwbfile.add_stimulus_template(visual_stimulus_images)
-    nwbfile.add_stimulus(image_index)
-
-########################################
-# 3) Besides the two-photon calcium image stack, the running speed of the animal was also recorded in this experiment.
-# We can store this data as a TimeSeries, in the acquisition portion of the file.
-
-running_speed = TimeSeries(
-    name="running_speed", data=running_data, timestamps=timestamps, unit="cm/s"
-)
-
-nwbfile.add_acquisition(running_speed)
-
-########################################
-# 4) In NWB:N, an "epoch" is an interval of experiment time that can slice into a timeseries (for example running_speed,
-# the one we just added). PyNWB uses an object-oriented approach to create links into these timeseries, so that data is
-# not copied multiple times. Here, we extract the stimulus epochs (both fine and coarse-grained) from the Brain
-# Observatory experiment using the allensdk.
-
-for _, row in trial_table.iterrows():
-    nwbfile.add_epoch(
-        start_time=row.start,
-        stop_time=row.end,
-        timeseries=[running_speed],
-        tags="trials",
-    )
-
-for _, row in epoch_table.iterrows():
-    nwbfile.add_epoch(
-        start_time=row.start,
-        stop_time=row.end,
-        timeseries=[running_speed],
-        tags="stimulus",
-    )
-
-########################################
-# 5) In the brain observatory, a two-photon microscope is used to acquire images of the calcium activity of neurons
-# expressing a fluorescent protein indicator. Essentially the microscope captures picture (30 times a second) at a
-# single depth in the visual cortex (the imaging plane). Let's use pynwb to store the metadata associated with this
-# hardware and experimental setup:
-optical_channel = OpticalChannel(
-    name="optical_channel",
-    description="2P Optical Channel",
-    emission_lambda=520.0,
-)
-
-device = Device(metadata["device"])
-nwbfile.add_device(device)
-
-imaging_plane = nwbfile.create_imaging_plane(
-    name="imaging_plane",
-    optical_channel=optical_channel,
-    description="Imaging plane ",
-    device=device,
-    excitation_lambda=float(metadata["excitation_lambda"].split(" ")[0]),
-    imaging_rate=30.0,
-    indicator="GCaMP6f",
-    location=metadata["targeted_structure"],
-    conversion=1.0,
-    unit="unknown",
-    reference_frame="unknown",
-)
-
-########################################
-# The Allen Institute does not include the raw imaging signal, as this data would make the file too large. Instead,
-# these data are preprocessed, and a dF/F fluorescence signal extracted for each region-of-interest (ROI). To store the
-# chain of computations necessary to describe this data processing pipeline, pynwb provides a "processing module" with
-# interfaces that simplify and standardize the process of adding the steps in this provenance chain to the file:
-ophys_module = nwbfile.create_processing_module(
-    name="ophys_module",
-    description="Processing module for 2P calcium responses",
-)
-
-########################################
-# 6) First, we add an image segmentation interface to the module. This interface implements a pre-defined schema and
-# API that facilitates writing segmentation masks for ROI's:
-
-image_segmentation_interface = ImageSegmentation(name="image_segmentation")
-
-ophys_module.add(image_segmentation_interface)
-
-plane_segmentation = image_segmentation_interface.create_plane_segmentation(
-    name="plane_segmentation",
-    description="Segmentation for imaging plane",
-    imaging_plane=imaging_plane,
-)
-
-for cell_specimen_id in cell_specimen_ids:
-    curr_name = cell_specimen_id
-    curr_image_mask = dataset.get_roi_mask_array([cell_specimen_id])[0]
-    plane_segmentation.add_roi(id=curr_name, image_mask=curr_image_mask)
-
-########################################
-# 7) Next, we add a dF/F interface to the module. This allows us to write the dF/F timeseries data associated with
-# each ROI.
-
-dff_interface = DfOverF(name="dff_interface")
-ophys_module.add(dff_interface)
-
-rt_region = plane_segmentation.create_roi_table_region(
-    description="segmented cells with cell_specimen_ids",
-)
-
-dFF_series = dff_interface.create_roi_response_series(
-    name="df_over_f",
-    data=dFF,
-    unit="NA",
-    rois=rt_region,
-    timestamps=timestamps,
-)
-
-########################################
-# Now that we have created the data set, we can write the file to disk:
-with NWBHDF5IO(save_file_name, mode="w") as io:
-    io.write(nwbfile)
-
-########################################
-# For good measure, lets read the data back in and see if everything went as planned:
-with NWBHDF5IO(save_file_name, mode="r") as io:
-    nwbfile_in = io.read()
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 741eb5975..5745a314e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -84,7 +84,6 @@ class CustomSphinxGallerySectionSortKey(ExampleTitleSortKey):
         "icephys.py",
         "plot_behavior.py",
         "images.py",
-        "brain_observatory.py"
     ],
     'advanced_io': []
 }
diff --git a/requirements-doc.txt b/requirements-doc.txt
index 8ff798ff2..86892aa09 100644
--- a/requirements-doc.txt
+++ b/requirements-doc.txt
@@ -4,8 +4,6 @@ sphinx>=4  # improved support for docutils>=0.17
 sphinx_rtd_theme>=1  # <1 does not work with docutils>=0.17
 matplotlib
 sphinx-gallery
-# allensdk>=2.13.2  # allensdk reinstalls pynwb and hdmf. TODO set up a separate workflow to test allensdk
-# MarkupSafe==2.0.1  # resolve incompatibility between jinja2 and markupsafe: https://github.com/AllenInstitute/AllenSDK/issues/2308
 Pillow
 sphinx-copybutton
 dataframe_image  # used to render large dataframe as image in the sphinx gallery to improve html display
diff --git a/src/pynwb/testing/mock/ecephys.py b/src/pynwb/testing/mock/ecephys.py
index 0669e7493..315eb3d9c 100644
--- a/src/pynwb/testing/mock/ecephys.py
+++ b/src/pynwb/testing/mock/ecephys.py
@@ -83,15 +83,16 @@ def mock_ElectricalSeries(
     # Set a default rate if timestamps are not provided
     rate = 30_000.0 if (timestamps is None and rate is None) else rate
 
+    n_electrodes = data.shape[1] if data is not None else 5
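+    # When data is passed explicitly, its second (channel) dimension determines how many mock
+    # electrodes are created below, so the data and the electrode table region stay consistent;
+    # e.g., data=np.ones((10, 2)) yields an electrode region with 2 electrodes.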
     electrical_series = ElectricalSeries(
         name=name or name_generator("ElectricalSeries"),
         description=description,
-        data=data if data is not None else np.ones((10, 5)),
+        data=data if data is not None else np.ones((10, n_electrodes)),
         rate=rate,
         starting_time=starting_time,
         timestamps=timestamps,
-        electrodes=electrodes or mock_electrodes(nwbfile=nwbfile),
+        electrodes=electrodes or mock_electrodes(nwbfile=nwbfile, n_electrodes=n_electrodes),
         filtering=filtering,
         conversion=conversion,
         offset=offset,
diff --git a/src/pynwb/testing/mock/file.py b/src/pynwb/testing/mock/file.py
index 943f86dcb..351f81454 100644
--- a/src/pynwb/testing/mock/file.py
+++ b/src/pynwb/testing/mock/file.py
@@ -1,7 +1,7 @@
 from typing import Optional
 from uuid import uuid4
 from datetime import datetime
-from dateutil.tz import tzlocal
+from dateutil.tz import tzutc
 
 from ...file import NWBFile, Subject
 from .utils import name_generator
@@ -10,7 +10,7 @@
 def mock_NWBFile(
     session_description: str = 'session_description',
     identifier: Optional[str] = None,
-    session_start_time: datetime = datetime(1970, 1, 1, tzinfo=tzlocal()),
+    session_start_time: datetime = datetime(1970, 1, 1, tzinfo=tzutc()),
     **kwargs
 ):
     return NWBFile(
diff --git a/test.py b/test.py
index 570bd4748..6543fe70c 100644
--- a/test.py
+++ b/test.py
@@ -86,10 +86,6 @@ def _import_from_file(script):
     os.path.join('advanced_io', 'streaming.py'),
 ]
 
-allensdk_examples = [
-    os.path.join('domain', 'brain_observatory.py'),  # TODO create separate workflow for this
-]
-
 
 def run_example_tests():
     """Run the Sphinx gallery example files, excluding ROS3-dependent ones, to check for errors."""
@@ -99,7 +95,7 @@ def run_example_tests():
         for f in files:
             if f.endswith(".py"):
                 name_with_parent_dir = os.path.join(os.path.basename(root), f)
-                if name_with_parent_dir in ros3_examples or name_with_parent_dir in allensdk_examples:
+                if name_with_parent_dir in ros3_examples:
                     logging.info("Skipping %s" % name_with_parent_dir)
                     continue
                 examples_scripts.append(os.path.join(root, f))
@@ -277,7 +273,6 @@ def clean_up_tests():
         "basic_sparse_iterwrite_*.npy",
         "basics_tutorial.nwb",
         "behavioral_tutorial.nwb",
-        "brain_observatory.nwb",
         "cache_spec_example.nwb",
         "ecephys_tutorial.nwb",
         "ecog.extensions.yaml",