Skip to content

Commit

Permalink
resolve conflict and merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
Jacob Mathias Schreiner committed Feb 14, 2025
2 parents 7fc6f37 + 144ca10 commit 7b2845b
Show file tree
Hide file tree
Showing 8 changed files with 319 additions and 17 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,10 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.14.1
hooks:
- id: mypy
additional_dependencies: [types-PyYAML, types-Pillow, types-tqdm]
description: Check for type errors
files: ^mllam_data_prep/
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby
- add support for zarr 3.0.0 and above [\#51](https://github.com/mllam/mllam-data-prep/pull/51), @kashif
- warn if the user tries to load a non-YAML file [\#50](https://github.com/mllam/mllam-data-prep/pull/50), @j6k4m8
- add mypy typing action to pre-commit hooks [\#67](https://github.com/mllam/mllam-data-prep/pull/67), @observingClouds

### Fixes

- fix bug which adds unwanted dimensions to the dataset [\#60](https://github.com/mllam/mllam-data-prep/pull/60), @ealerskans, @observingClouds
- correct chunk size estimate [\#59](https://github.com/mllam/mllam-data-prep/pull/59), @ealerskans
- fix bug arising when variables provided to derived functions are renamed [\#56](https://github.com/mllam/mllam-data-prep/pull/56), @leifdenby
- ensure config fields defaulting to `None` are typed as `Optional` and fields defaulting to `{}` are given a default-factory so that serialization with default values works correctly [\#63](https://github.com/mllam/mllam-data-prep/pull/63), @leifdenby
- fix reading of exported config files [\#67](https://github.com/mllam/mllam-data-prep/pull/67), @observingClouds

### Maintenance

Expand Down
2 changes: 1 addition & 1 deletion mllam_data_prep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import psutil
from dask.diagnostics import ProgressBar
from dask.distributed import LocalCluster
except ImportError or ModuleNotFoundError:
except (ImportError, ModuleNotFoundError):
DASK_DISTRIBUTED_AVAILABLE = False

if __name__ == "__main__":
Expand Down
16 changes: 8 additions & 8 deletions mllam_data_prep/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class ValueSelection:
"""

values: Union[List[Union[float, int]], Range]
units: str = None
units: Optional[str] = None


@dataclass
Expand Down Expand Up @@ -178,8 +178,8 @@ class DimMapping:
method: str
dims: Optional[List[str]] = None
dim: Optional[str] = None
name_format: str = field(default=None)
coord_ranges: Dict[str, Range] = None
name_format: Optional[str] = field(default=None)
coord_ranges: Optional[Dict[str, Range]] = field(default_factory=dict)


@dataclass
Expand Down Expand Up @@ -236,7 +236,7 @@ class InputDataset:
variables: Optional[Union[List[str], Dict[str, Dict[str, ValueSelection]]]] = None
derived_variables: Optional[Dict[str, DerivedVariable]] = None
attributes: Optional[Dict[str, Any]] = field(default_factory=dict)
coord_ranges: Dict[str, Range] = None
coord_ranges: Optional[Dict[str, Range]] = None


@dataclass
Expand Down Expand Up @@ -276,7 +276,7 @@ class Split:

start: str
end: str
compute_statistics: Statistics = None
compute_statistics: Optional[Statistics] = None


@dataclass
Expand Down Expand Up @@ -335,9 +335,9 @@ class Output:
"""

variables: Dict[str, List[str]]
coord_ranges: Dict[str, Range] = None
coord_ranges: Dict[str, Range] = field(default_factory=dict)
chunking: Dict[str, int] = field(default_factory=dict)
splitting: Splitting = None
splitting: Optional[Splitting] = None


@dataclass
Expand Down Expand Up @@ -374,7 +374,7 @@ class Config(dataclass_wizard.JSONWizard, dataclass_wizard.YAMLWizard):
inputs: Dict[str, InputDataset]
schema_version: str
dataset_version: str
extra: Dict[str, Any] = None
extra: Dict[str, Any] = field(default_factory=dict)

def __post_init__(self):
validate_config(self.inputs)
Expand Down
10 changes: 5 additions & 5 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Optional

import numpy as np
import xarray as xr
Expand Down Expand Up @@ -128,7 +129,7 @@ def create_dataset(config: Config):
f" {', '.join(SUPPORTED_CONFIG_VERSIONS)} are supported by mllam-data-prep "
f"v{__version__}."
)
if config.schema_version == "v0.2.0" and config.extra is not None:
if config.schema_version == "v0.2.0" and config.extra:
raise ValueError(
"Config schema version v0.2.0 does not support the `extra` field. Please "
"update the schema version used in your config to v0.5.0."
Expand Down Expand Up @@ -159,11 +160,9 @@ def create_dataset(config: Config):
if input_config.coord_ranges is not None:
ds_input = selection.select_by_kwargs(ds_input, **input_config.coord_ranges)

# Initialize the output dataset and add dimensions
# Initialize the output dataset
ds = xr.Dataset()
ds.attrs.update(ds_input.attrs)
for dim in ds_input.dims:
ds = ds.assign_coords({dim: ds_input.coords[dim]})

if selected_variables:
logger.info(f"Extracting selected variables from dataset {dataset_name}")
Expand All @@ -189,6 +188,7 @@ def create_dataset(config: Config):
ds=ds_input,
derived_variable=derived_variable,
chunking=chunking_config,
target_dims=expected_input_var_dims,
)

_check_dataset_attributes(
Expand Down Expand Up @@ -294,7 +294,7 @@ def create_dataset(config: Config):
return ds


def create_dataset_zarr(fp_config, fp_zarr: str = None):
def create_dataset_zarr(fp_config: Path, fp_zarr: Optional[str | Path] = None):
"""
Create a dataset from the input datasets specified in the config file and write it to a zarr file.
The path to the zarr file is the same as the config file, but with the extension changed to '.zarr'.
Expand Down
7 changes: 4 additions & 3 deletions mllam_data_prep/ops/derive_variable/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"]


def derive_variable(ds, derived_variable, chunking):
def derive_variable(ds, derived_variable, chunking, target_dims):
"""
Derive a variable using the `function` and `kwargs` of `derived_variable`.
Expand All @@ -33,15 +33,16 @@ def derive_variable(ds, derived_variable, chunking):
chunking: Dict[str, int]
Dictionary with keys as the dimensions to chunk along and values
with the chunk size
target_dims: List[str]
List of dims from ds to broadcast derived variable to,
if not used in calculation
Returns
-------
xr.Dataset
Dataset with derived variables included
"""

target_dims = list(ds.sizes.keys())

function_namespace = derived_variable.function
expected_field_attributes = derived_variable.attrs

Expand Down
10 changes: 10 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,13 @@ def test_that_range_accepts_datetime():
step = "PT3H"

range_ = config.Range(start=start, end=end, step=step)


def test_config_roundtrip():
    """Verify Config serialization round-trips losslessly.

    Loads the valid example config and checks that serializing it to a
    dict, to YAML and to JSON — then deserializing each — reproduces an
    object equal to the original.
    """
    config = mdp.Config.from_yaml(VALID_EXAMPLE_CONFIG_YAML)
    # Each (serialize, deserialize) pair must be the identity on `config`.
    restored_variants = (
        mdp.Config.from_dict(config.to_dict()),
        mdp.Config.from_yaml(config.to_yaml()),
        mdp.Config.from_json(config.to_json()),
    )
    for restored in restored_variants:
        assert restored == config
Loading

0 comments on commit 7b2845b

Please sign in to comment.