FIX: polish the bias analysis #4

Merged · 17 commits · May 28, 2024
2 changes: 2 additions & 0 deletions ilamb3/analysis/__init__.py
@@ -1,3 +1,5 @@
"""Modular ILAMB methodology functions."""

from ilamb3.analysis.bias import bias_analysis
from ilamb3.analysis.relationship import relationship_analysis

77 changes: 54 additions & 23 deletions ilamb3/analysis/bias.py
@@ -24,8 +24,9 @@
com: xr.Dataset,
method: Literal["Collier2018", "RegionalQuantiles"] = "Collier2018",
regions: list[Union[str, None]] = [None],
mass_weighting: bool = False,
use_uncertainty: bool = True,
spatial_sum: bool = False,
mass_weighting: bool = False,
quantile_dbase: Union[pd.DataFrame, None] = None,
quantile_threshold: int = 70,
) -> tuple[pd.DataFrame, xr.Dataset, xr.Dataset]:
@@ -39,12 +40,16 @@
The name of the scoring methodology to use.
regions
A list of region labels over which to apply the analysis.
mass_weighting
Enable to weight the score map integrals by the temporal mean of the
reference dataset.
use_uncertainty
Enable to utilize uncertainty information from the reference product if
present.
spatial_sum
Enable to report a spatial sum in the period mean as opposed to a spatial
mean. This is often preferred in carbon variables where the total global
carbon is of interest.
mass_weighting
Enable to weight the score map integrals by the temporal mean of the
reference dataset.
quantile_dbase
If using `method='RegionalQuantiles'`, the dataframe containing the regional
quantiles to be used to score the datasets.
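To make `spatial_sum` concrete: for carbon variables the reported scalar is the area integral (a global total) rather than an area-weighted mean. A minimal sketch of the distinction with hypothetical numbers, not ilamb3 code:

```python
import numpy as np

# Hypothetical cell areas (m^2) and a carbon flux field (kg m-2 s-1).
cell_area = np.array([1.0e12, 1.1e12, 0.9e12])
flux = np.array([2.0e-8, 1.5e-8, 2.5e-8])

spatial_mean = np.average(flux, weights=cell_area)  # kg m-2 s-1 (spatial_sum=False)
spatial_sum = np.sum(flux * cell_area)              # kg s-1, a global total (spatial_sum=True)
```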
@@ -114,9 +119,17 @@
norm = dset.std_time(ref, varname)

# Nest the grids for comparison; composite grid variables are suffixed with "_"
ref_, com_, norm_, uncert_ = cmp.nest_spatial_grids(
ref_mean, com_mean, norm, uncert
)
if dset.is_spatial(ref) and dset.is_spatial(com):
ref_, com_, norm_, uncert_ = cmp.nest_spatial_grids(
ref_mean, com_mean, norm, uncert
)
elif dset.is_site(ref) and dset.is_site(com):
ref_ = ref_mean
com_ = com_mean
norm_ = norm
uncert_ = uncert
else:
raise ValueError("Reference and comparison not uniformly site/spatial.")

# Compute score by different methods
bias = com_ - ref_
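The branch above keys off `dset.is_spatial` / `dset.is_site`, whose implementations are not shown in this diff. A rough illustration of the distinction being drawn (an assumption about the helpers, not ilamb3 code):

```python
import xarray as xr

def looks_spatial(da: xr.DataArray) -> bool:
    # Hypothetical check: gridded data carries latitude/longitude as dimensions.
    return any(d in da.dims for d in ("lat", "latitude")) and any(
        d in da.dims for d in ("lon", "longitude")
    )

def looks_site(da: xr.DataArray) -> bool:
    # Hypothetical check: site data has a site dimension, with lat/lon attached
    # as 1D coordinates along that dimension rather than as dimensions themselves.
    return "site" in da.dims
```

Raising on mixed pairs is consistent with the comparison logic in `ilamb3/compare.py` below, which extracts the reference sites from a gridded comparison before the analysis runs.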
@@ -142,18 +155,44 @@
ref_out["uncert"] = uncert
com_out = bias.to_dataset(name="bias")
com_out["bias_score"] = score
lat_name = dset.get_dim_name(com_mean, "lat")
lon_name = dset.get_dim_name(com_mean, "lon")
com_out["mean"] = com_mean.rename(
{lat_name: f"{lat_name}_", lon_name: f"{lon_name}_"}
)
try:
lat_name = dset.get_dim_name(com_mean, "lat")
lon_name = dset.get_dim_name(com_mean, "lon")
com_mean = com_mean.rename(
{lat_name: f"{lat_name}_", lon_name: f"{lon_name}_"}
)
except KeyError:
pass
com_out["mean"] = com_mean

# Function for finding a spatial/site mean/sum
def _scalar(var, varname, region, mean=True, weight=False):
da = var
if isinstance(var, xr.Dataset):
da = var[varname]
if dset.is_spatial(da):
da = dset.integrate_space(
da,
varname,
region=region,
mean=mean,
weight=ref_ if (mass_weighting and weight) else None,
)
elif dset.is_site(da):
site_dim = dset.get_dim_name(da, "site")
da = da.pint.dequantify()
da = da.mean(dim=site_dim)
da = da.pint.quantify()
else:
raise ValueError(f"Input is neither spatial nor site: {da}")

return da

# Compute scalars over all regions
dfs = []
for region in regions:
# Period mean
for src, var in zip(["Reference", "Comparison"], [ref_mean, com_mean]):
var = dset.integrate_space(var, varname, region=region, mean=True)
var = _scalar(var, varname, region, not spatial_sum)
dfs.append(
[
src,
@@ -166,9 +205,7 @@
]
)
# Bias
bias_scalar = dset.integrate_space(
com_out, "bias", region=region, mean=True
)
bias_scalar = _scalar(com_out, "bias", region, True)
dfs.append(
[
"Comparison",
@@ -181,13 +218,7 @@
]
)
# Bias Score
bias_scalar_score = dset.integrate_space(
com_out,
"bias_score",
region=region,
mean=True,
weight=ref_ if mass_weighting else None,
)
bias_scalar_score = _scalar(com_out, "bias_score", region, True, True)
dfs.append(
[
"Comparison",
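On `mass_weighting`: passing the reference period mean as the `weight` turns the regional score into a weighted average, so cells where the reference holds more of the quantity dominate. A toy sketch of the effect with hypothetical numbers (not the `integrate_space` internals, which presumably also apply area weights):

```python
import numpy as np

score = np.array([0.9, 0.4, 0.7])      # hypothetical per-cell bias scores
ref_mean = np.array([5.0, 1.0, 2.0])   # hypothetical reference period means ("mass")

unweighted = score.mean()                            # every cell counts equally
mass_weighted = np.average(score, weights=ref_mean)  # high-mass cells dominate
```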
104 changes: 86 additions & 18 deletions ilamb3/compare.py
@@ -1,6 +1,5 @@
"""Functions for preparing datasets for comparison."""

import datetime
from typing import Union

import numpy as np
@@ -136,25 +135,35 @@

def trim_time(dsa: xr.Dataset, dsb: xr.Dataset) -> tuple[xr.Dataset, xr.Dataset]:
"""Return the datasets trimmed to maximal temporal overlap."""
if "time" not in dsa.dims:
return dsa, dsb
if "time" not in dsb.dims:
return dsa, dsb
at0, atf = dset.get_time_extent(dsa)
bt0, btf = dset.get_time_extent(dsb)
tol = datetime.timedelta(days=1) # add some padding
tm0 = max(at0, bt0) - tol # type: ignore
tmf = min(atf, btf) + tol # type: ignore
dsa = dsa.sel(time=slice(tm0, tmf))
dsb = dsb.sel(time=slice(tm0, tmf))

def _to_tuple(da: xr.DataArray) -> tuple[int, int, int]:
if da.size != 1:
raise ValueError("Single element conversions only")
return (int(da.dt.year), int(da.dt.month), int(da.dt.day))

# Get the time extents in the original calendars
ta0, taf = dset.get_time_extent(dsa)
tb0, tbf = dset.get_time_extent(dsb)

# Convert to a date tuple (year, month, day) and find the maximal overlap
tmin = max(_to_tuple(ta0), _to_tuple(tb0))
tmax = min(_to_tuple(taf), _to_tuple(tbf))

# Recast back into native calendar objects and select
dsa = dsa.sel(
{"time": slice(ta0.item().__class__(*tmin), taf.item().__class__(*tmax))}
)
dsb = dsb.sel(
{"time": slice(tb0.item().__class__(*tmin), tbf.item().__class__(*tmax))}
)
return dsa, dsb
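The tuple round-trip exists because the two datasets may use different calendars (for example a noleap model against a standard-calendar reference), and datetimes from different cftime calendars are not directly comparable. A small sketch of the mechanism, assuming cftime time axes (classes and dates below are hypothetical):

```python
import cftime

ta0 = cftime.DatetimeNoLeap(1980, 1, 16)     # hypothetical start of dataset a
tb0 = cftime.DatetimeGregorian(1982, 1, 16)  # hypothetical start of dataset b

# The overlap is found on plain (year, month, day) tuples, which compare safely
# across calendars ...
tmin = max((ta0.year, ta0.month, ta0.day), (tb0.year, tb0.month, tb0.day))

# ... and is then recast into each dataset's native calendar class before the
# .sel(time=slice(...)) above.
start_a = ta0.__class__(*tmin)  # DatetimeNoLeap(1982, 1, 16)
start_b = tb0.__class__(*tmin)  # DatetimeGregorian(1982, 1, 16)
```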


def adjust_lon(dsa: xr.Dataset, dsb: xr.Dataset) -> tuple[xr.Dataset, xr.Dataset]:
"""When comparing dsb to dsa, we need their longitudes uniformly in
[-180,180) or [0,360)."""
alon_name = dset.get_dim_name(dsa, "lon")
blon_name = dset.get_dim_name(dsb, "lon")
alon_name = dset.get_coord_name(dsa, "lon")
blon_name = dset.get_coord_name(dsb, "lon")
if alon_name is None or blon_name is None:
return dsa, dsb
a360 = (dsa[alon_name].min() >= 0) * (dsa[alon_name].max() <= 360)
@@ -180,13 +189,72 @@
ref: xr.Dataset, com: xr.Dataset, varname: str
) -> tuple[xr.Dataset, xr.Dataset]:
"""Return the datasets in a form where they are comparable."""

# trim away time
ref, com = trim_time(ref, com)
try:
ref, com = trim_time(ref, com)
except KeyError:
pass # no time dimension

# ensure longitudes are uniform
ref, com = adjust_lon(ref, com)

# convert units
# pick just the sites
if dset.is_site(ref[varname]):
com = extract_sites(ref, com, varname)

# convert units, will read in memory so do this last
ref = ref.pint.quantify()
com = dset.convert(com, ref[varname].pint.units, varname=varname)

# ensure longitudes are uniform
ref, com = adjust_lon(ref, com)
return ref, com
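The unit conversion is deferred to the end because quantifying with pint reads values into memory, and by that point the time trimming and site extraction have already reduced the data. Roughly what a pint-based round-trip looks like (illustrative only, not the `dset.convert` implementation):

```python
import xarray as xr
import pint_xarray  # noqa: F401  (registers the .pint accessor)

# Hypothetical array carrying a units attribute.
da = xr.DataArray([1.0, 2.0], dims="site", attrs={"units": "g / m**2 / day"})
da = da.pint.quantify()           # attach pint units (reads values into memory)
da = da.pint.to("kg / m**2 / s")  # convert
da = da.pint.dequantify()         # back to plain values plus a units attribute
```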


def extract_sites(
ds_site: xr.Dataset, ds_spatial: xr.Dataset, varname: str
) -> xr.Dataset:
"""
Extract the `ds_site` lat/lons from `ds_spatial`, choosing the nearest grid cell for each site.

Parameters
----------
ds_site : xr.Dataset
The dataset which contains sites.
ds_spatial : xr.Dataset
The dataset from which we will make the extraction.
varname : str
The name of the variable of interest.

Returns
-------
xr.Dataset
`ds_spatial` at the sites defined in `ds_site`.
"""
# If this throws an exception, this isn't a site data array
dset.get_dim_name(ds_site[varname], "site")

# Get the lat/lon dim names
lat_site = dset.get_coord_name(ds_site, "lat")
lon_site = dset.get_coord_name(ds_site, "lon")
lat_spatial = dset.get_dim_name(ds_spatial, "lat")
lon_spatial = dset.get_dim_name(ds_spatial, "lon")

# Store the mean model resolution
model_res = np.sqrt(
ds_spatial[lon_spatial].diff(dim=lon_spatial).mean() ** 2
+ ds_spatial[lat_spatial].diff(dim=lat_spatial).mean() ** 2
)

# Select the spatial grid cells nearest to each site
ds_spatial = ds_spatial.sel(
{lat_spatial: ds_site[lat_site], lon_spatial: ds_site[lon_site]},
method="nearest",
)

# Check that these sites are 'close enough'
dist = np.sqrt(
(ds_site[lat_site] - ds_spatial[lat_spatial]) ** 2
+ (ds_site[lon_site] - ds_spatial[lon_spatial]) ** 2
)
assert (dist < model_res).all()
return ds_spatial
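A usage sketch for `extract_sites`, assuming the site dimension is literally named `site` and that the coordinate-name lookup recognizes plain `lat`/`lon` (both are assumptions here; the grid and site values are hypothetical):

```python
import numpy as np
import xarray as xr
from ilamb3.compare import extract_sites

ds_spatial = xr.Dataset(  # hypothetical coarse global grid
    {"gpp": (("lat", "lon"), np.random.rand(4, 8))},
    coords={"lat": np.linspace(-60, 60, 4), "lon": np.linspace(-180, 135, 8)},
)
ds_site = xr.Dataset(  # hypothetical two-site observations
    {"gpp": ("site", np.array([1.0, 2.0]))},
    coords={
        "lat": ("site", np.array([-10.0, 35.0])),
        "lon": ("site", np.array([20.0, -100.0])),
    },
)

extracted = extract_sites(ds_site, ds_spatial, "gpp")
# extracted["gpp"] varies along 'site' and holds the nearest grid-cell values;
# the distance assertion guards against sites falling far outside the grid.
```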