From b19b9ddd522a7c37be406b9ee6cc45cb1c25bde3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enoc=20Mart=C3=ADnez?= <enoc.martinez@upc.edu>
Date: Wed, 3 May 2023 12:50:30 +0200
Subject: [PATCH] Adding multidimensional NetCDF file generation

---
 mooda/waterframe/__init__.py              |  2 +-
 mooda/waterframe/output/__init__.py       |  1 +
 mooda/waterframe/output/to_multidim_nc.py | 99 +++++++++++++++++++++++
 3 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 mooda/waterframe/output/to_multidim_nc.py

diff --git a/mooda/waterframe/__init__.py b/mooda/waterframe/__init__.py
index e239fcf..383dd73 100644
--- a/mooda/waterframe/__init__.py
+++ b/mooda/waterframe/__init__.py
@@ -9,7 +9,7 @@ class WaterFrame:
         min, max, copy, use_only, rename, corr, max_diff, time_intervals, resample, slice_time,
         info_metadata, info_vocabulary, drop, reduce_memory, pres2depth, psal2asal,
         asal_temp2dens)
-    from .output import to_nc, to_pkl, to_json, to_es, metadata_to_html, to_csv
+    from .output import to_nc, to_pkl, to_json, to_es, metadata_to_html, to_csv, to_multidim_nc
     from .plot import plot_timeseries, plot_timebar, plot_hist, plot
     from .qc import qc_flat_test, qc_range_test, qc_spike_test, qc_replace, qc_syntax_test
     from .iplot import (
diff --git a/mooda/waterframe/output/__init__.py b/mooda/waterframe/output/__init__.py
index 1e1f751..8c40dce 100644
--- a/mooda/waterframe/output/__init__.py
+++ b/mooda/waterframe/output/__init__.py
@@ -5,3 +5,4 @@
 from .to_es import to_es
 from .metadata_to_html import metadata_to_html
 from .to_csv import to_csv
+from .to_multidim_nc import to_multidim_nc
diff --git a/mooda/waterframe/output/to_multidim_nc.py b/mooda/waterframe/output/to_multidim_nc.py
new file mode 100644
index 0000000..c5f9a33
--- /dev/null
+++ b/mooda/waterframe/output/to_multidim_nc.py
@@ -0,0 +1,99 @@
+""" Function to be imported in a WaterFrame. It save the WaterFrame into a multidimensional NetCDF file. """
+from xarray import Dataset
+import netCDF4 as nc
+import pandas as pd
+import numpy as np
+
+
+def to_multidim_nc(self, path, dimensions: list,  time_key="TIME", compression=True, fill_value=-99999):
+    """
+    It saves the WaterFrame into a multidimensional NetCDF using NetCDF4 library
+
+    Parameters
+    ----------
+        path: str
+            Path to save the NetCDF. If path is None, the filename will be metadata['id'].
+        dimensions: list
+            List of the variable names to be stored as dimensions
+        time_key: str
+            Name of the time column
+        compression: str
+            If True, use zlib compression
+        fill_value: str
+                Fill value
+
+
+    Returns
+    -------
+        path: str
+            Path where the file is placed.
+    """
+
+    # Make sure that time is the last entry in the multiindex
+    if time_key in dimensions:
+        dimensions.remove(time_key)
+        dimensions.append(time_key)
+
+    df = self.data  # Access the DataFrame within the waterframe
+    df = df.reset_index()
+
+    index_df = df[dimensions].copy()  # create a dataframe with only the variables that will be used as indexes
+    multiindex = pd.MultiIndex.from_frame(index_df)  # create a multiindex from the dataframe
+
+    # Arrange other variables into a dict
+    data = {col: df[col].values for col in df.columns if col not in dimensions}
+
+    # Create a dataframe with multiindex
+    data_df = pd.DataFrame(data, index=multiindex)
+
+    dimensions = tuple(dimensions)
+
+    with nc.Dataset(path, "w", format="NETCDF4") as ncfile:
+        for dimension in dimensions:
+            data = index_df[dimension].values
+            values = np.unique(data)  # fixed-length dimension
+            if dimension == time_key:
+                # convert timestamp to float
+                index_df[time_key] = pd.to_datetime(index_df[time_key])
+                times = index_df[time_key].dt.to_pydatetime()
+                values = nc.date2num(times, "seconds since 1970-01-01", calendar="standard")
+
+            ncfile.createDimension(dimension, len(values))  # create dimension
+            if type(values[0]) == str:  # Some dimension may be a string (e.g. sesnor_id)
+                var = ncfile.createVariable(dimension, str, (dimension,), fill_value=fill_value, zlib=compression)
+            else:
+                var = ncfile.createVariable(dimension, 'f8', (dimension,), fill_value=fill_value, zlib=compression)
+
+            var[:] = values  # assign dimension values
+            # add all dimension metadata
+
+            for key, value in self.vocabulary[dimension].items():
+                if type(value) == list:
+                    values = [str(v) for v in value]
+                    value = join_attr.join(values)
+                var.setncattr(key, value)
+
+        for varname in data_df.columns:
+            values = data_df[varname].to_numpy()  # assign values to the variable
+            if varname.endswith("_QC"):
+                # Store Quality Control as unsigned bytes
+                var = ncfile.createVariable(varname, "u1", dimensions, fill_value=fill_value, zlib=compression)
+                var[:] = values.astype(np.int8)
+            else:
+                var = ncfile.createVariable(varname, 'float', dimensions, fill_value=fill_value, zlib=compression)
+                var[:] = values
+
+            # Adding metadata
+            for key, value in self.vocabulary[varname].items():
+                if type(value) == list:
+                    values = [str(v) for v in value]
+                    value = join_attr.join(values)
+                var.setncattr(key, value)
+
+        # Set global attibutes
+        for key, value in self.metadata.items():
+            if type(value) == list:
+                values = [str(v) for v in value]
+                value = join_attr.join(values)
+            ncfile.setncattr(key, value)
+    return path
\ No newline at end of file