|
| 1 | +"""Conversion from Numpy to MDIO.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from typing import TYPE_CHECKING |
| 6 | + |
| 7 | +import numpy as np |
| 8 | + |
| 9 | +from mdio.api.accessor import MDIOWriter |
| 10 | +from mdio.converters.segy import get_compressor |
| 11 | +from mdio.core.dimension import Dimension |
| 12 | +from mdio.core.factory import MDIOCreateConfig |
| 13 | +from mdio.core.factory import MDIOVariableConfig |
| 14 | +from mdio.core.factory import create_empty |
| 15 | +from mdio.core.grid import Grid |
| 16 | + |
| 17 | + |
| 18 | +if TYPE_CHECKING: |
| 19 | + from typing import Any |
| 20 | + |
| 21 | + from numpy.typing import DTypeLike |
| 22 | + from numpy.typing import NDArray |
| 23 | + |
| 24 | + |
def numpy_to_mdio(
    array: NDArray,
    mdio_path_or_buffer: str,
    chunksize: tuple[int, ...],
    index_names: list[str] | None = None,
    index_coords: dict[str, NDArray] | None = None,
    header_dtype: DTypeLike | None = None,
    lossless: bool = True,
    compression_tolerance: float = 0.01,
    storage_options: dict[str, Any] | None = None,
    overwrite: bool = False,
) -> None:
    """Conversion from NumPy array to MDIO format.

    This module provides functionality to convert a NumPy array into the MDIO
    format. The conversion process organizes the input array into a multidimensional
    tensor with specified indexing and compression options.

    Args:
        array: Input NumPy array to be converted to MDIO format.
        mdio_path_or_buffer: Output path for the MDIO file, either local or
            cloud-based (e.g., with `s3://`, `gcs://`, or `abfs://` protocols).
        chunksize: Tuple specifying the chunk sizes for each dimension of the
            array. It must match the number of dimensions in the input array.
        index_names: List of names for the index dimensions. If not provided,
            defaults to `dim_0`, `dim_1`, ..., with the last dimension named
            `sample`.
        index_coords: Dictionary mapping dimension names to their coordinate
            arrays. If not provided, defaults to sequential integers (0 to size-1)
            for each dimension.
        header_dtype: Data type for trace headers, if applicable. Defaults to None.
        lossless: If True, uses lossless Blosc compression with zstandard.
            If False, uses ZFP lossy compression (requires `zfpy` library).
        compression_tolerance: Tolerance for ZFP compression in lossy mode.
            Ignored if `lossless=True`. Default is 0.01, providing ~70% size
            reduction.
        storage_options: Dictionary of storage options for the MDIO output file
            (e.g., cloud credentials). Defaults to None (anonymous access).
        overwrite: If True, overwrites existing MDIO file at the specified path.

    Raises:
        ValueError: If the length of `chunksize` does not match the number of
            dimensions in the input array.
        ValueError: If an element of `index_names` is not included in the
            `index_coords` dictionary.
        ValueError: If any coordinate array in `index_coords` has a size that
            does not match the corresponding array dimension.

    Examples:
        To convert a 3D NumPy array to MDIO format locally with default chunking:

        >>> import numpy as np
        >>> from mdio.converters import numpy_to_mdio
        >>>
        >>> array = np.random.rand(100, 200, 300)
        >>> numpy_to_mdio(
        ...     array=array,
        ...     mdio_path_or_buffer="output/file.mdio",
        ...     chunksize=(64, 64, 64),
        ...     index_names=["inline", "crossline", "sample"],
        ... )

        For a cloud-based output on AWS S3 with custom coordinates:

        >>> coords = {
        ...     "inline": np.arange(0, 100, 2),
        ...     "crossline": np.arange(0, 200, 4),
        ...     "sample": np.linspace(0, 0.3, 300),
        ... }
        >>> numpy_to_mdio(
        ...     array=array,
        ...     mdio_path_or_buffer="s3://bucket/file.mdio",
        ...     chunksize=(32, 32, 128),
        ...     index_names=["inline", "crossline", "sample"],
        ...     index_coords=coords,
        ...     lossless=False,
        ...     compression_tolerance=0.01,
        ... )

        To convert a 2D array with default indexing and lossless compression:

        >>> array_2d = np.random.rand(500, 1000)
        >>> numpy_to_mdio(
        ...     array=array_2d,
        ...     mdio_path_or_buffer="output/file_2d.mdio",
        ...     chunksize=(512, 512),
        ... )
    """
    storage_options = storage_options or {}

    if len(chunksize) != array.ndim:
        # NOTE: previously this built a *tuple* of two f-strings (trailing
        # commas), so ValueError received a tuple instead of one message.
        message = (
            f"Length of chunks={len(chunksize)} must be "
            f"equal to array dimensions={array.ndim}"
        )
        raise ValueError(message)

    if index_names is None:
        # Default names: dim_0, dim_1, ..., with the last axis called "sample".
        index_names = [f"dim_{i}" for i in range(array.ndim - 1)]
        index_names.append("sample")

    if index_coords is None:
        # Default coordinates: 0..size-1 for every dimension.
        index_coords = {
            name: np.arange(size)
            for name, size in zip(index_names, array.shape, strict=True)
        }
    else:
        # Validate that user-supplied coordinates cover every dimension and
        # match the array's shape exactly.
        for name, size in zip(index_names, array.shape, strict=True):
            if name not in index_coords:
                message = f"Index name {name} not found in index_coords"
                raise ValueError(message)

            if index_coords[name].size != size:
                message = (
                    f"Size of index_coords[{name}]={index_coords[name].size} "
                    f"does not match array dimension={size}"
                )
                raise ValueError(message)

    # Variable-name suffix encodes which axes are chunked (chunksize > 0),
    # e.g. "012" for a fully chunked 3D array -> variable "chunked_012".
    suffix = "".join(
        str(dim_idx) for dim_idx, dim_chunks in enumerate(chunksize) if dim_chunks > 0
    )

    compressors = get_compressor(lossless, compression_tolerance)
    mdio_var = MDIOVariableConfig(
        name=f"chunked_{suffix}",
        dtype=str(array.dtype),
        chunks=chunksize,
        compressors=compressors,
        header_dtype=header_dtype,
    )

    # Build the grid from the (validated) coordinates and create the empty
    # MDIO store before writing.
    dims = [Dimension(name=name, coords=index_coords[name]) for name in index_names]
    create_conf = MDIOCreateConfig(
        path=mdio_path_or_buffer,
        grid=Grid(dims),
        variables=[mdio_var],
    )
    create_empty(create_conf, overwrite, storage_options)

    # Write the full array in one shot, then attach basic statistics.
    writer = MDIOWriter(mdio_path_or_buffer, suffix, storage_options)
    writer[:] = array
    writer.stats = {
        "mean": array.mean().item(),
        "std": array.std().item(),
        "rms": np.sqrt((array**2).sum() / array.size).item(),
        "min": array.min().item(),
        "max": array.max().item(),
    }
0 commit comments