Skip to content

Commit b9eef4b

Browse files
authored
Numpy array ingestion functionality (#531)
* Add numpy_to_mdio converter for NumPy to MDIO format
* Remove redundant blank lines and outdated ImportError docstring
* Add numpy converters to reference documentation
* Add numpy_to_mdio converter to API
* Add integration tests for NumPy to MDIO conversion
* Update SEGY reference to exclude 'get_compressor'

  Excluded the 'get_compressor' function from the SEGY module in the documentation. This ensures only relevant members are displayed, improving clarity for users.
* Update documentation structure for convenience functions
* Add type hint annotations and clean up formatting
* Increase test coverage threshold and update coverage config
1 parent 89cc91f commit b9eef4b

File tree

7 files changed

+306
-12
lines changed

7 files changed

+306
-12
lines changed

docs/reference.md

+12-9
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,21 @@ and
3333
```{eval-rst}
3434
.. automodule:: mdio.converters.segy
3535
:members:
36-
:exclude-members: grid_density_qc, parse_index_types
36+
:exclude-members: grid_density_qc, parse_index_types, get_compressor
3737
3838
.. automodule:: mdio.converters.mdio
3939
:members:
40+
41+
.. automodule:: mdio.converters.numpy
42+
:members:
43+
```
44+
45+
## Convenience Functions
46+
47+
```{eval-rst}
48+
.. automodule:: mdio.api.convenience
49+
:members:
50+
:exclude-members: create_rechunk_plan, write_rechunked_values
4051
```
4152

4253
## Core Functionality
@@ -61,11 +72,3 @@ and
6172
.. automodule:: mdio.core.serialization
6273
:members:
6374
```
64-
65-
## Convenience Functions
66-
67-
```{eval-rst}
68-
.. automodule:: mdio.api.convenience
69-
:members:
70-
:exclude-members: create_rechunk_plan, write_rechunked_values
71-
```

pyproject.toml

+5-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,11 @@ relative_files = true
9898

9999
[tool.coverage.report]
100100
show_missing = true
101-
fail_under = 80
101+
fail_under = 90
102+
exclude_also = [
103+
"if TYPE_CHECKING:",
104+
"raise NotImplementedError",
105+
]
102106

103107
[tool.isort]
104108
profile = "black"

src/mdio/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from mdio.api import MDIOWriter
77
from mdio.api.convenience import copy_mdio
88
from mdio.converters import mdio_to_segy
9+
from mdio.converters import numpy_to_mdio
910
from mdio.converters import segy_to_mdio
1011
from mdio.core.dimension import Dimension
1112
from mdio.core.factory import MDIOCreateConfig
@@ -20,6 +21,7 @@
2021
"MDIOWriter",
2122
"copy_mdio",
2223
"mdio_to_segy",
24+
"numpy_to_mdio",
2325
"segy_to_mdio",
2426
"Dimension",
2527
"MDIOCreateConfig",

src/mdio/converters/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""MDIO Data conversion API."""
22

33
from .mdio import mdio_to_segy
4+
from .numpy import numpy_to_mdio
45
from .segy import segy_to_mdio
56

67

7-
__all__ = ["mdio_to_segy", "segy_to_mdio"]
8+
__all__ = ["mdio_to_segy", "segy_to_mdio", "numpy_to_mdio"]

src/mdio/converters/numpy.py

+173
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""Conversion from Numpy to MDIO."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
import numpy as np
8+
9+
from mdio.api.accessor import MDIOWriter
10+
from mdio.converters.segy import get_compressor
11+
from mdio.core.dimension import Dimension
12+
from mdio.core.factory import MDIOCreateConfig
13+
from mdio.core.factory import MDIOVariableConfig
14+
from mdio.core.factory import create_empty
15+
from mdio.core.grid import Grid
16+
17+
18+
if TYPE_CHECKING:
19+
from typing import Any
20+
21+
from numpy.typing import DTypeLike
22+
from numpy.typing import NDArray
23+
24+
25+
def numpy_to_mdio(
26+
array: NDArray,
27+
mdio_path_or_buffer: str,
28+
chunksize: tuple[int, ...],
29+
index_names: list[str] | None = None,
30+
index_coords: dict[str, NDArray] | None = None,
31+
header_dtype: DTypeLike | None = None,
32+
lossless: bool = True,
33+
compression_tolerance: float = 0.01,
34+
storage_options: dict[str, Any] | None = None,
35+
overwrite: bool = False,
36+
):
37+
"""Conversion from NumPy array to MDIO format.
38+
39+
This module provides functionality to convert a NumPy array into the MDIO
40+
format. The conversion process organizes the input array into a multidimensional
41+
tensor with specified indexing and compression options.
42+
43+
Args:
44+
array: Input NumPy array to be converted to MDIO format.
45+
mdio_path_or_buffer: Output path for the MDIO file, either local or
46+
cloud-based (e.g., with `s3://`, `gcs://`, or `abfs://` protocols).
47+
chunksize: Tuple specifying the chunk sizes for each dimension of the
48+
array. It must match the number of dimensions in the input array.
49+
index_names: List of names for the index dimensions. If not provided,
50+
defaults to `dim_0`, `dim_1`, ..., with the last dimension named
51+
`sample`.
52+
index_coords: Dictionary mapping dimension names to their coordinate
53+
arrays. If not provided, defaults to sequential integers (0 to size-1)
54+
for each dimension.
55+
header_dtype: Data type for trace headers, if applicable. Defaults to None.
56+
lossless: If True, uses lossless Blosc compression with zstandard.
57+
If False, uses ZFP lossy compression (requires `zfpy` library).
58+
compression_tolerance: Tolerance for ZFP compression in lossy mode.
59+
Ignored if `lossless=True`. Default is 0.01, providing ~70% size
60+
reduction.
61+
storage_options: Dictionary of storage options for the MDIO output file
62+
(e.g., cloud credentials). Defaults to None (anonymous access).
63+
overwrite: If True, overwrites existing MDIO file at the specified path.
64+
65+
Raises:
66+
ValueError: If the length of `chunksize` does not match the number of
67+
dimensions in the input array.
68+
ValueError: If an element of `index_names` is not included in the
69+
`index_coords` dictionary.
70+
ValueError: If any coordinate array in `index_coords` has a size that
71+
does not match the corresponding array dimension.
72+
73+
74+
Examples:
75+
To convert a 3D NumPy array to MDIO format locally with default chunking:
76+
77+
>>> import numpy as np
78+
>>> from mdio.converters import numpy_to_mdio
79+
>>>
80+
>>> array = np.random.rand(100, 200, 300)
81+
>>> numpy_to_mdio(
82+
... array=array,
83+
... mdio_path_or_buffer="output/file.mdio",
84+
... chunksize=(64, 64, 64),
85+
... index_names=["inline", "crossline", "sample"],
86+
... )
87+
88+
For a cloud-based output on AWS S3 with custom coordinates:
89+
90+
>>> coords = {
91+
... "inline": np.arange(0, 100, 2),
92+
... "crossline": np.arange(0, 200, 4),
93+
... "sample": np.linspace(0, 0.3, 300),
94+
... }
95+
>>> numpy_to_mdio(
96+
... array=array,
97+
... mdio_path_or_buffer="s3://bucket/file.mdio",
98+
... chunksize=(32, 32, 128),
99+
... index_names=["inline", "crossline", "sample"],
100+
... index_coords=coords,
101+
... lossless=False,
102+
... compression_tolerance=0.01,
103+
... )
104+
105+
To convert a 2D array with default indexing and lossless compression:
106+
107+
>>> array_2d = np.random.rand(500, 1000)
108+
>>> numpy_to_mdio(
109+
... array=array_2d,
110+
... mdio_path_or_buffer="output/file_2d.mdio",
111+
... chunksize=(512, 512),
112+
... )
113+
"""
114+
storage_options = storage_options or {}
115+
116+
if len(chunksize) != array.ndim:
117+
message = (
118+
f"Length of chunks={len(chunksize)} must be ",
119+
f"equal to array dimensions={array.ndim}",
120+
)
121+
raise ValueError(message)
122+
123+
if index_names is None:
124+
index_names = index_names or [f"dim_{i}" for i in range(array.ndim - 1)]
125+
index_names.append("sample")
126+
127+
if index_coords is None:
128+
index_coords = {}
129+
for name, size in zip(index_names, array.shape, strict=True):
130+
index_coords[name] = np.arange(size)
131+
else:
132+
for name, size in zip(index_names, array.shape, strict=True):
133+
if name not in index_coords:
134+
message = f"Index name {name} not found in index_coords"
135+
raise ValueError(message)
136+
137+
if index_coords[name].size != size:
138+
message = (
139+
f"Size of index_coords[{name}]={index_coords[name].size} "
140+
f"does not match array dimension={size}"
141+
)
142+
raise ValueError(message)
143+
144+
suffix = [dim_chunks if dim_chunks > 0 else None for dim_chunks in chunksize]
145+
suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
146+
suffix = "".join(suffix)
147+
148+
compressors = get_compressor(lossless, compression_tolerance)
149+
mdio_var = MDIOVariableConfig(
150+
name=f"chunked_{suffix}",
151+
dtype=str(array.dtype),
152+
chunks=chunksize,
153+
compressors=compressors,
154+
header_dtype=header_dtype,
155+
)
156+
157+
dims = [Dimension(name=name, coords=index_coords[name]) for name in index_names]
158+
create_conf = MDIOCreateConfig(
159+
path=mdio_path_or_buffer,
160+
grid=Grid(dims),
161+
variables=[mdio_var],
162+
)
163+
create_empty(create_conf, overwrite, storage_options)
164+
165+
writer = MDIOWriter(mdio_path_or_buffer, suffix, storage_options)
166+
writer[:] = array
167+
writer.stats = {
168+
"mean": array.mean().item(),
169+
"std": array.std().item(),
170+
"rms": np.sqrt((array**2).sum() / array.size).item(),
171+
"min": array.min().item(),
172+
"max": array.max().item(),
173+
}
+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""Module for testing NumPy to MDIO conversion functionality.
2+
3+
This module contains tests for the `numpy_to_mdio` function, ensuring proper conversion
4+
of NumPy arrays to MDIO format, including validation of grid dimensions, chunk sizes,
5+
and coordinate handling.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from typing import TYPE_CHECKING
11+
12+
import numpy as np
13+
import numpy.testing as npt
14+
import pytest
15+
16+
from mdio.api.accessor import MDIOReader
17+
from mdio.converters.numpy import numpy_to_mdio
18+
from mdio.core.dimension import Dimension
19+
from mdio.core.grid import Grid
20+
21+
22+
if TYPE_CHECKING:
23+
from numpy.typing import NDArray
24+
25+
26+
# Reference grid axes used by the fixtures below, given as
# (name, start, stop, step) and expanded with np.arange.
TEST_DIMS = [
    Dimension(name=axis, coords=np.arange(start, stop, step))
    for axis, start, stop, step in (
        ("inline", 101, 131, 2),
        ("crossline", 10, 20, 1),
        ("sample", 0, 100, 5),
    )
]
31+
32+
33+
@pytest.fixture
def mock_grid() -> Grid:
    """Provide a Grid assembled from the module-level TEST_DIMS."""
    grid = Grid(dims=TEST_DIMS)
    return grid
37+
38+
39+
@pytest.fixture
def mock_array(mock_grid: Grid) -> NDArray:
    """Provide a float32 array of uniform random values shaped like the mock grid."""
    generator = np.random.default_rng()
    samples = generator.uniform(size=mock_grid.shape)
    return samples.astype("float32")
44+
45+
46+
# Chunk size of 8 along each of the three test dimensions.
CHUNK_SIZE = (8,) * 3
47+
48+
49+
def test_npy_to_mdio(mock_array: NDArray, mock_grid: Grid) -> None:
    """Round-trip an array through MDIO using default names and coordinates."""
    url = "memory://npy.mdio"
    numpy_to_mdio(mock_array, url, CHUNK_SIZE)
    reader = MDIOReader(url)

    npt.assert_array_equal(reader._traces, mock_array)

    expected_names = ("dim_0", "dim_1", "sample")
    assert reader.grid.dim_names == expected_names
    assert reader.chunks == CHUNK_SIZE
    assert reader.shape == mock_grid.shape
    # No names or coordinates were passed, so the stored dimensions are the
    # generated defaults and must not equal the mock grid's.
    assert reader.grid.dims != mock_grid.dims
59+
60+
61+
def test_npy_to_mdio_coords(mock_array: NDArray, mock_grid: Grid) -> None:
    """Round-trip an array through MDIO with explicit names and coordinates."""
    url = "memory://npy_coord.mdio"

    coords_by_name = {}
    for dim in mock_grid.dims:
        coords_by_name[dim.name] = dim.coords

    numpy_to_mdio(
        mock_array,
        url,
        CHUNK_SIZE,
        index_names=mock_grid.dim_names,
        index_coords=coords_by_name,
    )
    reader = MDIOReader(url)

    npt.assert_array_equal(reader._traces, mock_array)
    assert reader.chunks == CHUNK_SIZE
    assert reader.shape == mock_grid.shape
    # The grid read back must carry the dimensions that were passed in.
    assert reader.grid.dims == mock_grid.dims
74+
75+
76+
def test_npy_to_mdio_chunksize_mismatch(mock_array: NDArray, mock_grid: Grid) -> None:
    """A chunk size tuple longer than the array rank must raise ValueError."""
    too_many_chunks = (5, 10, 15, 20, 25)

    with pytest.raises(ValueError, match="equal to array dimensions"):
        numpy_to_mdio(mock_array, "", too_many_chunks)
80+
81+
82+
def test_npy_to_mdio_coord_missing(mock_array: NDArray, mock_grid: Grid) -> None:
    """Index names with no entry in the coordinate mapping must raise ValueError."""
    unknown_names = ["mismatch", "dimension", "names"]

    coords_by_name = {}
    for dim in mock_grid.dims:
        coords_by_name[dim.name] = dim.coords

    with pytest.raises(ValueError, match="not found in index_coords"):
        numpy_to_mdio(mock_array, "", CHUNK_SIZE, unknown_names, coords_by_name)
95+
96+
97+
def test_npy_to_mdio_coord_size_error(mock_array: NDArray, mock_grid: Grid) -> None:
    """Coordinate arrays whose size differs from the array axis must raise ValueError."""
    names = mock_grid.dim_names

    # Every axis gets a five-element coordinate array, regardless of the
    # actual axis length.
    wrong_size_coords = {}
    for dim in mock_grid.dims:
        wrong_size_coords[dim.name] = np.arange(5)

    with pytest.raises(ValueError, match="does not match array dimension"):
        numpy_to_mdio(mock_array, "", CHUNK_SIZE, names, wrong_size_coords)

tests/unit/test_auto_chunking.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Test live mask chunk size calculation."""
22

3+
from __future__ import annotations
4+
35
from typing import TYPE_CHECKING
46

57
import numpy as np
@@ -30,7 +32,7 @@
3032
@pytest.mark.filterwarnings("ignore:chunk size balancing not possible:UserWarning")
3133
def test_auto_chunking(
3234
shape: tuple[int, ...],
33-
dtype: "DTypeLike",
35+
dtype: DTypeLike,
3436
limit: int,
3537
expected_chunks: tuple[int, ...],
3638
) -> None:

0 commit comments

Comments
 (0)