From e9c273774b3367c3e6acb7b538e696a60e575d31 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Tue, 12 Mar 2024 22:11:58 +0000 Subject: [PATCH 1/2] Add docs on spec language support (#1069) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 5 +++++ docs/source/index.rst | 1 + docs/source/spec_language_support.rst | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 docs/source/spec_language_support.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 1294aee02..cd7b69fcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.13.0 (Upcoming) + +### Enhancements +- Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) + ## HDMF 3.12.2 (February 9, 2024) ### Bug fixes diff --git a/docs/source/index.rst b/docs/source/index.rst index e6a53d3ab..2fcd4778a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,6 +45,7 @@ If you use HDMF in your research, please use the following citation: building_api export validation + spec_language_support .. toctree:: :hidden: diff --git a/docs/source/spec_language_support.rst b/docs/source/spec_language_support.rst new file mode 100644 index 000000000..43a628093 --- /dev/null +++ b/docs/source/spec_language_support.rst @@ -0,0 +1,21 @@ + +.. _spec_language_support: + +=========================================== +Support for the HDMF Specification Language +=========================================== + +The HDMF API provides nearly full support for all features of the `HDMF Specification Language`_ +version 3.0.0, except for the following: + +1. Attributes containing multiple references (see `#833`_) +2. Certain text and integer values for quantity (see `#423`_, `#531`_) +3. Datasets that do not have a data_type_inc/data_type_def and contain either a reference dtype or a compound dtype (see `#737`_) +4. Passing dataset dtype and shape from parent data type to child data type (see `#320`_) + +.. _HDMF Specification Language: https://hdmf-schema-language.readthedocs.io +.. _#833: https://github.com/hdmf-dev/hdmf/issues/833 +.. _#423: https://github.com/hdmf-dev/hdmf/issues/423 +.. _#531: https://github.com/hdmf-dev/hdmf/issues/531 +.. _#737: https://github.com/hdmf-dev/hdmf/issues/737 +.. _#320: https://github.com/hdmf-dev/hdmf/issues/320 From f092cbbe5d5f110246c8b7518e118add26aa5203 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Thu, 14 Mar 2024 09:16:22 -0700 Subject: [PATCH 2/2] Warn when adding ragged arrays to DynamicTable without index argument (#1066) * add detection of ragged array inputs to table * add tests for ragged array inputs to table * add warnings for ragged inputs to table * update CHANGELOG.md * check only lists and tuples for raggedness * add flag to turn off ragged data checks * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 1 + src/hdmf/common/table.py | 26 +++++++++++-- src/hdmf/utils.py | 14 +++++++ tests/unit/common/test_table.py | 68 +++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd7b69fcb..8eddf8270 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements - Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) +- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066) ## HDMF 3.12.2 (February 9, 2024) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 5eeedcd86..3b67ff19d 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,7 @@ from . import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type +from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged from ..term_set import TermSetWrapper @@ -639,12 +639,16 @@ def __len__(self): {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None}, {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique', 'default': False}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_row(self, **kwargs): """ Add a row to the table. If *id* is not provided, it will auto-increment. """ - data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs) + data, row_id, enforce_unique_id, check_ragged = popargs('data', 'id', 'enforce_unique_id', 'check_ragged', + kwargs) data = data if data is not None else kwargs bad_data = [] @@ -709,6 +713,11 @@ def add_row(self, **kwargs): c.add_vector(data[colname]) else: c.add_row(data[colname]) + if check_ragged and is_ragged(c.data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when creating a column to add rows " + "with different lengths."), + stacklevel=2) def __eq__(self, other): """Compare if the two DynamicTables contain the same data. @@ -748,6 +757,9 @@ def __eq__(self, other): 'doc': ('class to use to represent the column data. If table=True, this field is ignored and a ' 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData ' 'object is used.')}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_column(self, **kwargs): # noqa: C901 """ @@ -760,7 +772,7 @@ def add_column(self, **kwargs): # noqa: C901 :raises ValueError: if the column has already been added to the table """ name, data = getargs('name', 'data', kwargs) - index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs) + index, table, enum, col_cls, check_ragged = popargs('index', 'table', 'enum', 'col_cls', 'check_ragged', kwargs) if isinstance(index, VectorIndex): warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be " @@ -823,6 +835,14 @@ def add_column(self, **kwargs): # noqa: C901 # once we have created the column create_vector_index = None if ckwargs.get('data', None) is not None: + + # if no index was provided, check that data is not ragged + if index is False and check_ragged and is_ragged(data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when adding a column of data with " + "different lengths."), + stacklevel=2) + # Check that we are asked to create an index if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0: # Iteratively flatten the data we use for the column based on the depth of the index to generate. diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 57a4bb465..5e0b61539 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -954,6 +954,20 @@ def to_uint_array(arr): raise ValueError('Cannot convert array of dtype %s to uint.' % arr.dtype) +def is_ragged(data): + """ + Test whether a list of lists or array is ragged / jagged + """ + if isinstance(data, (list, tuple)): + lengths = [len(sub_data) if isinstance(sub_data, (list, tuple)) else 1 for sub_data in data] + if len(set(lengths)) > 1: + return True # ragged at this level + + return any(is_ragged(sub_data) for sub_data in data) # check next level + + return False + + class LabelledDict(dict): """A dict wrapper that allows querying by an attribute of the values and running a callable on removed items. diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 7246a8ba8..d98add060 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -354,6 +354,74 @@ def test_add_column_multi_index(self): ] ) + def test_add_column_without_required_index(self): + """ + Add a column with different element lengths without specifying an index parameter + """ + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + + # testing adding column without a necessary index parameter + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + str_data = [['a', 'b'], ['a', 'b', 'c']] + empty_data = [[1, 2], []] + multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]] + tuple_data = ((1, 2, 3), (1, 2, 3, 4)) + + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. Use the 'index' argument when adding a column of data with different lengths.") + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col1', description='', data=lol_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col2', description='', data=str_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col3', description='', data=empty_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col4', description='', data=multi_nested_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col5', description='', data=tuple_data,) + + def test_add_column_without_required_index_and_no_ragged_check(self): + """ + Add a column with different element lengths without checking for raggedness + """ + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_column(name='col1', description='', data=lol_data, check_ragged=False) + + def test_add_row_without_required_index(self): + """ + Add rows with different element lengths without specifying an index parameter + """ + + # test adding row of list data with different lengths without index parameter + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. Use the 'index' argument when creating a column to add rows with different lengths.") + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + with self.assertWarnsWith(UserWarning, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) + + # test adding row of tuple/str data with different lengths without index parameter + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b')) + with self.assertWarnsWith(UserWarning, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b', 'c')) + + def test_add_row_without_required_index_and_no_ragged_check(self): + """ + Add rows with different element lengths without checking for raggedness + """ + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4], check_ragged=False) + def test_add_column_auto_index_int(self): """ Add a column as a list of lists after we have already added data so that we need to create a single VectorIndex