diff --git a/.gitignore b/.gitignore
index 3a8562e..010e8d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ build
 dev.ipynb
 geoglows.egg-info
 dist
-.pypirc
\ No newline at end of file
+.pypirc
+*.parquet
\ No newline at end of file
diff --git a/README.md b/README.md
index b48e002..bbb7e9c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# GEOGloWS
+# GEOGLOWS
 [![Conda Version](https://img.shields.io/conda/vn/conda-forge/geoglows.svg)](https://img.shields.io/conda/vn/conda-forge/geoglows.svg)
 [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/geoglows.svg)](https://anaconda.org/conda-forge/geoglows)
 [![PyPI](https://img.shields.io/pypi/v/geoglows)](https://pypi.org/project/geoglows)
@@ -16,8 +16,6 @@ Also available on pypi
 
 pip install geoglows
 ```
-Tools to access and process data coming from the GEOGloWS initiative.
+Tools to access and process data coming from the GEOGLOWS initiative.
 
 https://geoglows.readthedocs.io
-
-The sample data are from ReachID 13073600 which is on the mississippi river
\ No newline at end of file
diff --git a/docs/api-documentation.rst b/docs/api-documentation.rst
index 85c13d8..5cd35dd 100644
--- a/docs/api-documentation.rst
+++ b/docs/api-documentation.rst
@@ -7,11 +7,10 @@ There are 3 modules in the geoglows package.
 .. toctree::
    :maxdepth: 3
 
-   api-documentation/streamflow
+   api-documentation/data
    api-documentation/bias
    api-documentation/plots
-   api-documentation/analysis
-   api-documentation/examples
+   api-documentation/analyze
 
 
 FAQ
@@ -19,7 +18,7 @@
 How do I save streamflow data to csv?
 -------------------------------------
 
-By default, the results of most of the `geoglows.streamflow` functions return a pandas DataFrame. You can save those to
+By default, the results of most of the `geoglows.data` functions return a pandas DataFrame. You can save those to
 a csv, json, pickle, or other file. For example, save to csv with the dataframe's ``.to_csv()`` method.
 
 .. code-block:: python
diff --git a/docs/api-documentation/analysis.rst b/docs/api-documentation/analysis.rst
deleted file mode 100644
index d2017a5..0000000
--- a/docs/api-documentation/analysis.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-=================
-geoglows.analysis
-=================
-
-Analysis
-~~~~~~~~
-Functions which post process results from the streamflow data service into additional, useful products
-
-.. automodule:: geoglows.analysis
-    :members:
-        compute_daily_average, compute_monthly_average, compute_return_periods, compute_anomaly
diff --git a/docs/api-documentation/analyze.rst b/docs/api-documentation/analyze.rst
new file mode 100644
index 0000000..ac63474
--- /dev/null
+++ b/docs/api-documentation/analyze.rst
@@ -0,0 +1,11 @@
+================
+geoglows.analyze
+================
+
+Analyze
+~~~~~~~
+Functions which post-process results from the streamflow data service into additional, useful products
+
+.. automodule:: geoglows.analyze
+    :members: gumbel1, simple_forecast, forecast_stats, daily_averages, monthly_averages, annual_averages, daily_stats,
+        daily_variance, daily_flow_anomaly, return_periods, low_return_periods
diff --git a/docs/api-documentation/data.rst b/docs/api-documentation/data.rst
new file mode 100644
index 0000000..611c52d
--- /dev/null
+++ b/docs/api-documentation/data.rst
@@ -0,0 +1,30 @@
+=============
+geoglows.data
+=============
+
+The data module provides functions for requesting forecasted and historical river discharge simulation data.
+The data can be retrieved from the REST data service hosted by ECMWF or from the repository sponsored
+by the AWS Open Data Program. The speed and reliability of the AWS source are typically better than the REST service.
+
+In general, each function requires a river ID. The name of the ID varies by stream network dataset. It is called a
+LINKNO in GEOGLOWS, which uses the TDX-Hydro streams dataset. This is the same as the reach_id or common ID (COMID)
+used previously. To find a LINKNO (river ID number), please refer to https://data.geoglows.org and browse the tutorials.
+
+Forecasted Streamflow
+---------------------
+
+.. automodule:: geoglows.data
+    :members:
+        forecast, forecast_stats, forecast_ensembles, forecast_records
+    :noindex:
+
+Historical Simulation
+---------------------
+
+.. automodule:: geoglows.data
+    :members:
+        retrospective, return_periods, annual_averages, monthly_averages, daily_averages
+    :noindex:
+
+GEOGLOWS Model Utilities
+------------------------
+
+.. automodule:: geoglows.data
+    :members:
+        metadata_tables
+    :noindex:
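+
+Example
+-------
+
+As a quick sketch, retrieve the retrospective simulation for one river and save it to csv. The river ID and the
+``river_id`` keyword below are illustrative placeholders; check the function signatures above for the exact arguments.
+
+.. code-block:: python
+
+    import geoglows
+
+    # request the retrospective simulation for a single river by its LINKNO
+    df = geoglows.data.retrospective(river_id=760000001)
+    df.to_csv('retrospective.csv')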
diff --git a/docs/api-documentation/examples.rst b/docs/api-documentation/examples.rst
deleted file mode 100644
index 7caf247..0000000
--- a/docs/api-documentation/examples.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-=================
-geoglows.examples
-=================
-
-.. automodule:: geoglows.examples
-    :members:
-        stats, ens, hist, rper, rec
-    :noindex:
diff --git a/docs/api-documentation/streamflow.rst b/docs/api-documentation/streamflow.rst
deleted file mode 100644
index 15ec00c..0000000
--- a/docs/api-documentation/streamflow.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-===================
-geoglows.streamflow
-===================
-
-The streamflow module provides a series of functions for requesting forecasted and historical data from the GEOGloWS
-ECMWF Streamflow Service. This data is available from this service through a REST API. Simply put, a REST API is a way
-to get information over the internet without using a web browser.
-
-In general, a method requires an ID, called the reach_id or common id (COMID), for a specific stream. This ID is unique
-to the stream network configured for this Service. It is arbitrarily assigned so that there is a way to keep data
-organized.
-
-Forecasted Streamflow
----------------------
-
-.. automodule:: geoglows.streamflow
-    :members:
-        forecast_stats, forecast_ensembles, forecast_warnings, forecast_records
-
-Historically Simulated Streamflow
----------------------------------
-
-.. automodule:: geoglows.streamflow
-    :members:
-        historic_simulation, return_periods, daily_averages, monthly_averages
-    :noindex:
-
-GEOGloWS Model Utilities
-------------------------
-
-.. automodule:: geoglows.streamflow
-    :members:
-        available_data, available_regions, available_dates, reach_to_region, reach_to_latlon, latlon_to_reach, latlon_to_region
-    :noindex:
diff --git a/docs/dev-notes.rst b/docs/dev-notes.rst
deleted file mode 100644
index da448d9..0000000
--- a/docs/dev-notes.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-================
-Developing Notes
-================
-
-Packaging for PyPi
-~~~~~~~~~~~~~~~~~~
-Sign up for an account on pypi.org
-
-.. code-block:: bash
-
-    pip install twine
-    python setup.py sdist
-    twine upload dist/name_of_dist.tar.gz
-
-Packaging for Conda/Anaconda Cloud
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Sign up for an account on anaconda.org
-
-useful reference
-
-https://docs.conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html
-
-https://docs.conda.io/projects/conda/en/latest/user-guide/configuration/use-condarc.html
-
-.. code-block:: bash
-
-    conda install anaconda-client conda-build
-    conda config --set anaconda_upload no
-    conda build . --no-test
-    anaconda upload /path/to/distribution -u user_or_group_name -l label
-
-Developing documentation
-~~~~~~~~~~~~~~~~~~~~~~~~
-Use Sphinx
-
-.. code-block:: bash
-
-    conda install sphinx sphinx_rtd_theme
-    make html
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index adfcd1c..0dba9bc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -7,8 +7,8 @@ geoglows
 .. image:: https://anaconda.org/geoglows/geoglows/badges/latest_release_date.svg
    :target: https://anaconda.org/geoglows/geoglows
 
-The geoglows python package is intended to promote access to data, API's, and code developed for the `GEOGloWS ECMWF Streamflow Model `_.
-Read more about GEOGloWS at ``_
+The geoglows Python package enables access to data, APIs, and code developed for the `GEOGLOWS Streamflow Model `_.
+Read more about GEOGLOWS at ``_
 
 Demos
 =====
@@ -16,13 +16,13 @@ These links will be maintained to reference the most updated versions of the tut
 The tutorials are GitHub Gists which you can copy and launch in a Google Collaboratory setting directly from the GitHub.
 
 - Retrieve & plot GEOGloWS model data: ``_
-- Finding Stream ID #'s programatically: ``_
+- Finding Stream ID #'s programmatically: ``_
 - Bias Evaluation and Calibration at a point: ``_
 - Generate/Download High Res Plot Images: ``_
 
 About GEOGloWS ECMWF Streamflow
 ===============================
-GEOGloWS ECMWF Streamflow Project: This project provides access to the results of a hydrologic model that is run each
+GEOGLOWS ECMWF Streamflow Project: This project provides access to the results of a hydrologic model that is run each
 day. The model is based on a group of unique weather forecasts, known as an ensemble, from ECMWF. Each unique
 precipitation forecast, known as an ensemble member, produces a unique streamflow forecast. There are 52 members of
 the ensemble that drives the model each day. The ERA-5 historical precipitation dataset to also used to produce a
 hindcasted streamflow on each river.
 `Read more here =3
+  - dask
+  - fastparquet
+  - requests
+  - pandas
+  - plotly>=5
+  - jinja2
+  - shapely>=2
+  - scipy>=1
+  - s3fs
+  - numpy>=1
+  - hydrostats
+  - HydroErr
+  - xarray
+  - zarr
diff --git a/geoglows/__init__.py b/geoglows/__init__.py
index a468fc9..28d51e5 100644
--- a/geoglows/__init__.py
+++ b/geoglows/__init__.py
@@ -1,10 +1,16 @@
 import geoglows.bias
-import geoglows.plots
-import geoglows.streamflow
-import geoglows.analysis
-import geoglows.examples
+import geoglows._plots as plots
+import geoglows.data
+import geoglows.analyze
+import geoglows.streams
+import geoglows.tables
 
-__all__ = ['bias', 'plots', 'streamflow', 'analysis']
-__version__ = '0.27.1'
+from ._constants import METADATA_TABLE_LOCAL_PATH as METADATA_TABLE_PATH
+
+__all__ = [
+    'bias', 'plots', 'data', 'analyze', 'streams', 'tables',
+    'METADATA_TABLE_PATH'
+]
+__version__ = '1.0.0'
 __author__ = 'Riley Hales'
 __license__ = 'BSD 3-Clause Clear License'
diff --git a/geoglows/_constants.py b/geoglows/_constants.py
new file mode 100644
index 0000000..08d0cbc
--- /dev/null
+++ b/geoglows/_constants.py
@@ -0,0 +1,3 @@
+import os
+
+METADATA_TABLE_LOCAL_PATH = os.path.join(os.path.dirname(__file__), 'data', 'metadata-tables.parquet')
diff --git a/geoglows/_plots/__init__.py b/geoglows/_plots/__init__.py
new file mode 100644
index 0000000..8206376
--- /dev/null
+++ b/geoglows/_plots/__init__.py
@@ -0,0 +1,4 @@
+from .plots import *
+
+
+__all__ = []
diff --git a/geoglows/_plots/format_tools.py b/geoglows/_plots/format_tools.py
new file mode 100644
index 0000000..41cdbf2
--- /dev/null
+++ b/geoglows/_plots/format_tools.py
@@ -0,0 +1,28 @@
+from plotly.offline import plot as offline_plot
+
+
+def build_title(main_title, plot_titles: list):
+    if plot_titles is not None:
+        main_title += '<br>'.join(plot_titles)
+    return main_title
+
+
+def return_period_plot_colors():
+    return {
+        '2 Year': 'rgba(254, 240, 1, .4)',
+        '5 Year': 'rgba(253, 154, 1, .4)',
+        '10 Year': 'rgba(255, 56, 5, .4)',
+        '20 Year': 'rgba(128, 0, 246, .4)',
+        '25 Year': 'rgba(255, 0, 0, .4)',
+        '50 Year': 'rgba(128, 0, 106, .4)',
+        '100 Year': 'rgba(128, 0, 246, .4)',
+    }
+
+
+def plotly_figure_to_html_plot(figure, include_plotlyjs: bool = False, ) -> str:
+    return offline_plot(
+        figure,
+        config={'autosizable': True, 'responsive': True},
+        output_type='div',
+        include_plotlyjs=include_plotlyjs
+    )
diff --git a/geoglows/_plots/plotly_bias_corrected.py b/geoglows/_plots/plotly_bias_corrected.py
new file mode 100644
index 0000000..2340eb1
--- /dev/null
+++ b/geoglows/_plots/plotly_bias_corrected.py
@@ -0,0 +1,352 @@
+import hydrostats.data as hd
+import pandas as pd
+import plotly.graph_objects as go
+import scipy.stats
+from plotly.offline import plot as offline_plot
+
+from .format_tools import build_title
+from .plotly_helpers import _rperiod_scatters
+
+
+__all__ = [
+    'corrected_historical',
+    'corrected_scatterplots',
+    'corrected_day_average',
+    'corrected_month_average',
+    'corrected_volume_compare',
+]
+
+
+# BIAS CORRECTION PLOTS
+def corrected_historical(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame,
+                         rperiods: pd.DataFrame = None, plot_titles: list = None,
+                         plot_type: str = 'plotly') -> go.Figure or str:
+    """
+    Creates a plot of corrected discharge, observed discharge, and simulated discharge
+
+    Args:
+        corrected: the response from the geoglows.bias.correct_historical_simulation function
+        simulated: the csv response from historic_simulation
+        observed: the dataframe of observed data. Must have a datetime index and a single column of flow values
+        rperiods: the csv response from return_periods
+        plot_type: either 'json', 'plotly', or 'plotly_html' (default plotly)
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Returns:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    startdate = corrected.index[0]
+    enddate = corrected.index[-1]
+
+    plot_data = {
+        'x_simulated': corrected.index.tolist(),
+        'x_observed': observed.index.tolist(),
+        'y_corrected': corrected.values.flatten(),
+        'y_simulated': simulated.values.flatten(),
+        'y_observed': observed.values.flatten(),
+        'y_max': max(corrected.values.max(), observed.values.max(), simulated.values.max()),
+    }
+    if rperiods is not None:
+        plot_data.update(rperiods.to_dict(orient='index').items())
+        rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max'], plot_data['y_max'])
+    else:
+        rperiod_scatters = []
+
+    if plot_type == 'json':
+        return plot_data
+
+    scatters = [
+        go.Scatter(
+            name='Simulated Data',
+            x=plot_data['x_simulated'],
+            y=plot_data['y_simulated'],
+            line=dict(color='red')
+        ),
+        go.Scatter(
+            name='Observed Data',
+            x=plot_data['x_observed'],
+            y=plot_data['y_observed'],
+            line=dict(color='blue')
+        ),
+        go.Scatter(
+            name='Corrected Simulated Data',
+            x=plot_data['x_simulated'],
+            y=plot_data['y_corrected'],
+            line=dict(color='#00cc96')
+        ),
+    ]
+    scatters += rperiod_scatters
+
+    layout = go.Layout(
+        title=build_title("Historical Simulation Comparison", plot_titles),
+        yaxis={'title': 'Discharge (m3/s)'},
+        xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate], 'hoverformat': '%b %d %Y',
+               'tickformat': '%Y'},
+    )
+
+    figure = go.Figure(data=scatters, layout=layout)
+    if plot_type == 'plotly':
+        return figure
+    if plot_type == 'plotly_html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose json, plotly, or plotly_html')
+
+
+def corrected_scatterplots(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame,
+                           merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False,
+                           plot_titles: list = None, ) -> go.Figure or str:
+    """
+    Creates a scatter plot comparing the simulated and corrected discharge against the observed discharge.
+    This function uses hydrostats.data.merge_data on the 3 inputs. If you have already computed these because
+    you are doing a full comparison of bias correction, you can provide them to save time
+
+    Args:
+        corrected: the response from the geoglows.bias.correct_historical_simulation function
+        simulated: the csv response from historic_simulation
+        observed: the dataframe of observed data. Must have a datetime index and a single column of flow values
+        merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed)
+        merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed)
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Returns:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # use the merged dataframes if they were provided, otherwise merge the raw inputs
+    if merged_sim_obs is False or merged_cor_obs is False:
+        merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed)
+        merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed)
+
+    # get the min/max values for plotting the 45 degree line
+    min_value = min(min(merged_sim_obs.iloc[:, 1].values), min(merged_sim_obs.iloc[:, 0].values))
+    max_value = max(max(merged_sim_obs.iloc[:, 1].values), max(merged_sim_obs.iloc[:, 0].values))
+
+    # do a linear regression on both of the merged dataframes
+    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(merged_sim_obs.iloc[:, 0].values,
+                                                                         merged_sim_obs.iloc[:, 1].values)
+    slope2, intercept2, r_value2, p_value2, std_err2 = scipy.stats.linregress(merged_cor_obs.iloc[:, 0].values,
+                                                                              merged_cor_obs.iloc[:, 1].values)
+    scatter_sets = [
+        go.Scatter(
+            x=merged_sim_obs.iloc[:, 0].values,
+            y=merged_sim_obs.iloc[:, 1].values,
+            mode='markers',
+            name='Original Data',
+            marker=dict(color='#ef553b')
+        ),
+        go.Scatter(
+            x=merged_cor_obs.iloc[:, 0].values,
+            y=merged_cor_obs.iloc[:, 1].values,
+            mode='markers',
+            name='Corrected',
+            marker=dict(color='#00cc96')
+        ),
+        go.Scatter(
+            x=[min_value, max_value],
+            y=[min_value, max_value],
+            mode='lines',
+            name='45 degree line',
+            line=dict(color='black')
+        ),
+        go.Scatter(
+            x=[min_value, max_value],
+            y=[slope * min_value + intercept, slope * max_value + intercept],
+            mode='lines',
+            name=f'Y = {round(slope, 2)}x + {round(intercept, 2)} (Original)',
+            line=dict(color='red')
+        ),
+        go.Scatter(
+            x=[min_value, max_value],
+            y=[slope2 * min_value + intercept2, slope2 * max_value + intercept2],
+            mode='lines',
+            name=f'Y = {round(slope2, 2)}x + {round(intercept2, 2)} (Corrected)',
+            line=dict(color='green')
+        )
+    ]
+
+    updatemenus = [
+        dict(active=0,
+             buttons=[dict(label='Linear Scale',
+                           method='update',
+                           args=[{'visible': [True, True]},
+                                 {'title': 'Linear scale',
+                                  'yaxis': {'type': 'linear'}}]),
+                      dict(label='Log Scale',
+                           method='update',
+                           args=[{'visible': [True, True]},
+                                 {'title': 'Log scale',
+                                  'xaxis': {'type': 'log'},
+                                  'yaxis': {'type': 'log'}}]),
+                      ]
+             )
+    ]
+
+    layout = go.Layout(title=build_title('Bias Correction Scatter Plot', plot_titles),
+                       xaxis=dict(title='Simulated', ),
+                       yaxis=dict(title='Observed', autorange=True),
+                       showlegend=True, updatemenus=updatemenus)
+    return go.Figure(data=scatter_sets, layout=layout)
+
+
+def corrected_month_average(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame,
+                            merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False,
+                            plot_titles: list = None, ) -> go.Figure or str:
+    """
+    Calculates and plots the monthly average streamflow. This function uses
+    hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full
+    comparison of bias correction, you can provide them to save time
+
+    Args:
+        corrected: the response from the geoglows.bias.correct_historical_simulation function
+        simulated: the csv response from historic_simulation
+        observed: the dataframe of observed data. Must have a datetime index and a single column of flow values
+        merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed)
+        merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed)
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Returns:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # use the merged dataframes if they were provided, otherwise merge the raw inputs
+    if merged_sim_obs is False or merged_cor_obs is False:
+        merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed)
+        merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed)
+    monthly_avg = hd.monthly_average(merged_sim_obs)
+    monthly_avg2 = hd.monthly_average(merged_cor_obs)
+
+    scatters = [
+        go.Scatter(x=monthly_avg.index, y=monthly_avg.iloc[:, 1].values, name='Observed Data'),
+        go.Scatter(x=monthly_avg.index, y=monthly_avg.iloc[:, 0].values, name='Simulated Data'),
+        go.Scatter(x=monthly_avg2.index, y=monthly_avg2.iloc[:, 0].values, name='Corrected Simulated Data'),
+    ]
+
+    layout = go.Layout(
+        title=build_title('Monthly Average Streamflow Comparison', plot_titles=plot_titles),
+        xaxis=dict(title='Month'), yaxis=dict(title='Discharge (m3/s)', autorange=True),
+        showlegend=True)
+
+    return go.Figure(data=scatters, layout=layout)
+
+
+def corrected_day_average(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame,
+                          merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False,
+                          titles: dict = None, ) -> go.Figure or str:
+    """
+    Calculates and plots the daily average streamflow. This function uses
+    hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full
+    comparison of bias correction, you can provide them to save time
+
+    Args:
+        corrected: the response from the geoglows.bias.correct_historical_simulation function
+        simulated: the csv response from historic_simulation
+        observed: the dataframe of observed data. Must have a datetime index and a single column of flow values
+        merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed)
+        merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed)
+        titles: (dict) Extra info to show on the title of the plot.
+            For example: {'Reach ID': 1234567, 'Drainage Area': '1000km^2'}
+
+    Returns:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # use the merged dataframes if they were provided, otherwise merge the raw inputs
+    if merged_sim_obs is False or merged_cor_obs is False:
+        merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed)
+        merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed)
+    daily_avg = hd.daily_average(merged_sim_obs)
+    daily_avg2 = hd.daily_average(merged_cor_obs)
+
+    scatters = [
+        go.Scatter(x=daily_avg.index, y=daily_avg.iloc[:, 1].values, name='Observed Data'),
+        go.Scatter(x=daily_avg.index, y=daily_avg.iloc[:, 0].values, name='Simulated Data'),
+        go.Scatter(x=daily_avg2.index, y=daily_avg2.iloc[:, 0].values, name='Corrected Simulated Data'),
+    ]
+
+    layout = go.Layout(
+        title=build_title('Daily Average Streamflow Comparison', titles),
+        xaxis=dict(title='Days'), yaxis=dict(title='Discharge (m3/s)', autorange=True),
+        showlegend=True)
+
+    return go.Figure(data=scatters, layout=layout)
+
+
+def corrected_volume_compare(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame,
+                             merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False,
+                             plot_titles: list = None, ) -> go.Figure or str:
+    """
+    Calculates and plots the cumulative volume output on each of the 3 datasets provided. This function uses
+    hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full
+    comparison of bias correction, you can provide them to save time
+
+    Args:
+        corrected: the response from the geoglows.bias.correct_historical_simulation function
+        simulated: the csv response from historic_simulation
+        observed: the dataframe of observed data. Must have a datetime index and a single column of flow values
+        merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed)
+        merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed)
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Returns:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # use the merged dataframes if they were provided, otherwise merge the raw inputs
+    if merged_sim_obs is False or merged_cor_obs is False:
+        merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed)
+        merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed)
+
+    sim_array = merged_sim_obs.iloc[:, 0].values
+    obs_array = merged_sim_obs.iloc[:, 1].values
+    corr_array = merged_cor_obs.iloc[:, 0].values
+
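+    # 0.0864 converts a mean flow in m^3/s to a volume in millions of m^3 per day
+    # (86,400 seconds/day divided by 10^6), assuming the merged dataframes hold daily values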
+    sim_volume_dt = sim_array * 0.0864
+    obs_volume_dt = obs_array * 0.0864
+    corr_volume_dt = corr_array * 0.0864
+
+    sim_volume_cum = []
+    obs_volume_cum = []
+    corr_volume_cum = []
+    sum_sim = 0
+    sum_obs = 0
+    sum_corr = 0
+
+    for i in sim_volume_dt:
+        sum_sim = sum_sim + i
+        sim_volume_cum.append(sum_sim)
+
+    for j in obs_volume_dt:
+        sum_obs = sum_obs + j
+        obs_volume_cum.append(sum_obs)
+
+    for k in corr_volume_dt:
+        sum_corr = sum_corr + k
+        corr_volume_cum.append(sum_corr)
+
+    observed_volume = go.Scatter(x=merged_sim_obs.index, y=obs_volume_cum, name='Observed', )
+    simulated_volume = go.Scatter(x=merged_sim_obs.index, y=sim_volume_cum, name='Simulated', )
+    corrected_volume = go.Scatter(x=merged_cor_obs.index, y=corr_volume_cum, name='Corrected Simulated', )
+
+    layout = go.Layout(
+        title=build_title('Cumulative Volume Comparison', plot_titles=plot_titles),
+        xaxis=dict(title='Datetime', ), yaxis=dict(title='Volume (m3)', autorange=True),
+        showlegend=True)
+
+    return go.Figure(data=[observed_volume, simulated_volume, corrected_volume], layout=layout)
diff --git a/geoglows/_plots/plotly_forecasts.py b/geoglows/_plots/plotly_forecasts.py
new file mode 100644
index 0000000..464ab23
--- /dev/null
+++ b/geoglows/_plots/plotly_forecasts.py
@@ -0,0 +1,302 @@
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+from .format_tools import build_title
+from .plotly_helpers import _rperiod_scatters
+
+__all__ = [
+    'forecast',
+    'forecast_stats',
+    'forecast_ensembles',
+    'forecast_records',
+]
+
+
+def forecast(df: pd.DataFrame, *,
+             rp_df: pd.DataFrame = None,
+             plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots forecasted streamflow and optional return periods
+
+    Args:
+        df: the dataframe response from geoglows.data.forecast
+        rp_df: optional dataframe of return period data
+        plot_titles: optional list of strings to include in the figure title. Each list item will be on a new line.
+
+    Returns:
+        go.Figure
+    """
+    scatter_traces = [
+        go.Scatter(
+            x=df.index,
+            y=df['flow_median'],
+            name='Streamflow (Median)',
+            line=dict(color='black'),
+        ),
+        go.Scatter(
+            name='Uncertainty Bounds',
+            x=np.concatenate([df.index.values, df.index.values[::-1]]),
+            y=np.concatenate([df['flow_uncertainty_upper'], df['flow_uncertainty_lower'][::-1]]),
+            legendgroup='uncertainty',
+            showlegend=True,
+            fill='toself',
+            line=dict(color='lightblue', dash=None)
+        ),
+        go.Scatter(
+            name='Uncertainty Upper Bounds (80%)',
+            x=df.index,
+            y=df['flow_uncertainty_upper'],
+            legendgroup='uncertainty',
+            showlegend=False,
+            line=dict(color='lightblue', dash='dash')
+        ),
+        go.Scatter(
+            name='Uncertainty Lower Bounds (20%)',
+            x=df.index,
+            y=df['flow_uncertainty_lower'],
+            legendgroup='uncertainty',
+            showlegend=False,
+            line=dict(color='lightblue', dash='dash')
+        ),
+    ]
+
+    if rp_df is not None:
+        rperiod_scatters = _rperiod_scatters(df.index[0], df.index[-1], rp_df, df['flow_uncertainty_upper'].max())
+        scatter_traces += rperiod_scatters
+
+    layout = go.Layout(
+        title=build_title('Forecasted Streamflow', plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={'title': 'Date (UTC +0:00)', 'range': [df.index[0], df.index[-1]]},
+    )
+
+    return go.Figure(scatter_traces, layout=layout)
+
+
+def forecast_stats(df: pd.DataFrame, *,
+                   rp_df: pd.DataFrame = None,
+                   plot_titles: list = None,
+                   show_maxmin: bool = False, ) -> go.Figure:
+    """
+    Makes the streamflow data and optional metadata into a plotly plot
+
+    Args:
+        df: the csv response from forecast_stats
+        rp_df: the csv response from return_periods
+        plot_titles: a list of strings to place in the figure title. Each list item will be on a new line.
+        show_maxmin: Choose to show or hide the max/min envelope by default
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # Start processing the inputs
+    dates = df.index.tolist()
+    startdate = dates[0]
+    enddate = dates[-1]
+
+    plot_data = {
+        'x_stats': df['flow_avg'].dropna(axis=0).index.tolist(),
+        'x_hires': df['high_res'].dropna(axis=0).index.tolist(),
+        'y_max': max(df['flow_max']),
+        'flow_max': list(df['flow_max'].dropna(axis=0)),
+        'flow_75%': list(df['flow_75p'].dropna(axis=0)),
+        'flow_avg': list(df['flow_avg'].dropna(axis=0)),
+        'flow_med': list(df['flow_med'].dropna(axis=0)),
+        'flow_25%': list(df['flow_25p'].dropna(axis=0)),
+        'flow_min': list(df['flow_min'].dropna(axis=0)),
+        'high_res': list(df['high_res'].dropna(axis=0)),
+    }
+
+    maxmin_visible = True if show_maxmin else 'legendonly'
+    scatter_plots = [
+        # Plot together so you can use fill='toself' for the shaded box, also separately so the labels appear
+        go.Scatter(name='Maximum & Minimum Flow',
+                   x=plot_data['x_stats'] + plot_data['x_stats'][::-1],
+                   y=plot_data['flow_max'] + plot_data['flow_min'][::-1],
+                   legendgroup='boundaries',
+                   fill='toself',
+                   visible=maxmin_visible,
+                   line=dict(color='lightblue', dash='dash')),
+        go.Scatter(name='Maximum',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_max'],
+                   legendgroup='boundaries',
+                   visible=maxmin_visible,
+                   showlegend=False,
+                   line=dict(color='darkblue', dash='dash')),
+        go.Scatter(name='Minimum',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_min'],
+                   legendgroup='boundaries',
+                   visible=maxmin_visible,
+                   showlegend=False,
+                   line=dict(color='darkblue', dash='dash')),
+
+        go.Scatter(name='25-75 Percentile Flow',
+                   x=plot_data['x_stats'] + plot_data['x_stats'][::-1],
+                   y=plot_data['flow_75%'] + plot_data['flow_25%'][::-1],
+                   legendgroup='percentile_flow',
+                   fill='toself',
+                   line=dict(color='lightgreen'), ),
+        go.Scatter(name='75%',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_75%'],
+                   showlegend=False,
+                   legendgroup='percentile_flow',
+                   line=dict(color='green'), ),
+        go.Scatter(name='25%',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_25%'],
+                   showlegend=False,
+                   legendgroup='percentile_flow',
+                   line=dict(color='green'), ),
+
+        go.Scatter(name='Higher Time Step Forecast',
+                   x=plot_data['x_hires'],
+                   y=plot_data['high_res'],
+                   visible=True,
+                   line={'color': 'black'}, ),
+
+        go.Scatter(name='Average Flow',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_avg'],
+                   line=dict(color='blue'), ),
+        go.Scatter(name='Median Flow',
+                   x=plot_data['x_stats'],
+                   y=plot_data['flow_med'],
+                   line=dict(color='red'), ),
+    ]
+    if rp_df is not None:
+        plot_data.update(rp_df.to_dict(orient='index').items())
+        max_visible = max(max(plot_data['flow_75%']), max(plot_data['flow_avg']), max(plot_data['high_res']))
+        rperiod_scatters = _rperiod_scatters(startdate, enddate, rp_df, plot_data['y_max'], max_visible)
+        scatter_plots += rperiod_scatters
+
+    layout = go.Layout(
+        title=build_title('Forecasted Streamflow', plot_titles),
+        yaxis={
+            'title': 'Streamflow (m3/s)',
+            'range': [0, 'auto']
+        },
+        xaxis={
+            'title': 'Date (UTC +0:00)',
+            'range': [startdate, enddate],
+            'hoverformat': '%b %d %Y',
+            'tickformat': '%b %d %Y'
+        },
+    )
+
+    return go.Figure(scatter_plots, layout=layout)
+
+
+def forecast_ensembles(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = None, ) -> go.Figure:
+    """
+    Makes the streamflow ensemble data and metadata into a plotly plot
+
+    Args:
+        df: the dataframe response from geoglows.data.forecast_ensembles
+        rp_df: the csv response from return_periods
+        plot_titles: a list of strings to place in the figure title. Each list item will be on a new line.
+
+    Return:
+        go.Figure
+    """
+    # variables to determine the maximum flow and hold all the scatter plot lines
+    max_flows = []
+    scatter_plots = []
+
+    # determine the threshold values for return periods and the start/end dates so we can plot them
+    dates = df.index.tolist()
+    startdate = dates[0]
+    enddate = dates[-1]
+
+    # process the series' components and store them in a dictionary
+    plot_data = {
+        'x_1-51': df['ensemble_01'].dropna(axis=0).index.tolist(),
+        'x_52': df['ensemble_52'].dropna(axis=0).index.tolist(),
+    }
+
+    # add a dictionary entry for each of the ensemble members. The key for each series is the column name, e.g. 'ensemble_01'
+    for ensemble in df.columns:
+        plot_data[ensemble] = df[ensemble].dropna(axis=0).tolist()
+        max_flows.append(max(plot_data[ensemble]))
+    plot_data['y_max'] = max(max_flows)
+
+    if rp_df is not None:
+        plot_data.update(rp_df.to_dict(orient='index').items())
+        rperiod_scatters = _rperiod_scatters(startdate, enddate, rp_df, plot_data['y_max'])
+    else:
+        rperiod_scatters = []
+
+    # create the high resolution line (ensemble 52)
+    scatter_plots.append(go.Scatter(
+        name='High Resolution Forecast',
+        x=plot_data['x_52'],
+        y=plot_data['ensemble_52'],
+        line=dict(color='black')
+    ))
+    # create a line for the rest of the ensembles (1-51)
+    for i in range(1, 52):
+        scatter_plots.append(go.Scatter(
+            name='Ensemble ' + str(i),
+            x=plot_data['x_1-51'],
+            y=plot_data[f'ensemble_{i:02}'],
+            legendgroup='Ensemble Members'
+        ))
+    scatter_plots += rperiod_scatters
+
+    # define a layout for the plot
+    layout = go.Layout(
+        title=build_title('Ensemble Predicted Streamflow', plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={
+            'title': 'Date (UTC +0:00)',
+            'range': [startdate, enddate],
+            'hoverformat': '%b %d %Y',
+            'tickformat': '%b %d %Y'
+        },
+    )
+    return go.Figure(scatter_plots, layout=layout)
+
+
+def forecast_records(recs: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = None, ) -> go.Figure:
+    """
+    Makes the streamflow saved forecast data and metadata into a plotly plot
+
+    Args:
+        recs: the csv response from forecast_records
+        rp_df: the csv response from return_periods
+        plot_titles: a list of strings to place in the figure title. Each list item will be on a new line.
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    # Start processing the inputs
+    dates = recs.index.tolist()
+    startdate = dates[0]
+    enddate = dates[-1]
+
+    plot_data = {
+        'x_records': dates,
+        'recorded_flows': recs.dropna(axis=0).values.flatten(),
+        'y_max': max(recs.values),
+    }
+    if rp_df is not None:
+        plot_data.update(rp_df.to_dict(orient='index').items())
+        rperiod_scatters = _rperiod_scatters(startdate, enddate, rp_df, plot_data['y_max'], plot_data['y_max'])
+    else:
+        rperiod_scatters = []
+
+    scatter_plots = [go.Scatter(
+        name='Previous Forecast Average',
+        x=plot_data['x_records'],
+        y=plot_data['recorded_flows'],
+        line=dict(color='gold'),
+    )] + rperiod_scatters
+
+    layout = go.Layout(
+        title=build_title('Previous Forecasted Streamflow', plot_titles=plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate]},
+    )
+    return go.Figure(scatter_plots, layout=layout)
diff --git a/geoglows/_plots/plotly_helpers.py b/geoglows/_plots/plotly_helpers.py
new file mode 100644
index 0000000..1088656
--- /dev/null
+++ b/geoglows/_plots/plotly_helpers.py
@@ -0,0 +1,48 @@
+import pandas as pd
+import plotly.graph_objects as go
+
+from .format_tools import return_period_plot_colors
+
+__all__ = [
+    '_rperiod_scatters'
+]
+
+
+def _rperiod_scatters(startdate: str,
+                      enddate: str,
+                      rperiods: pd.DataFrame,
+                      y_max: float,
+                      max_plotted_flow: float = 0,
+                      visible: bool = None):
+    colors = return_period_plot_colors()
+    x_vals = (startdate, enddate, enddate, startdate)
+
+    r2 = int(rperiods[2].values[0])
+    r5 = int(rperiods[5].values[0])
+    r10 = int(rperiods[10].values[0])
+    r25 = int(rperiods[25].values[0])
+    r50 = int(rperiods[50].values[0])
+    r100 = int(rperiods[100].values[0])
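+    # the return period thresholds are drawn as stacked rectangles spanning the full x range;
+    # rmax caps the top band by extending one band-width (r100 - r25) above the 100-year
+    # threshold, or up to y_max if the plotted data reaches higher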
+    rmax = int(max(2 * r100 - r25, y_max))
+
+    if visible is None:
+        visible = True if max_plotted_flow > r2 else 'legendonly'
+
+    def template(name, y, color, fill='toself'):
+        return go.Scatter(
+            name=name,
+            x=x_vals,
+            y=y,
+            legendgroup='returnperiods',
+            fill=fill,
+            visible=visible,
+            line=dict(color=color, width=0))
+
+    return [
+        template('Return Periods', (r2, r2, rmax, rmax), 'rgba(0,0,0,0)', fill='none'),
+        template(f'2 Year: {r2}', (r2, r2, r5, r5), colors['2 Year']),
+        template(f'5 Year: {r5}', (r5, r5, r10, r10), colors['5 Year']),
+        template(f'10 Year: {r10}', (r10, r10, r25, r25), colors['10 Year']),
+        template(f'25 Year: {r25}', (r25, r25, r50, r50), colors['25 Year']),
+        template(f'50 Year: {r50}', (r50, r50, r100, r100), colors['50 Year']),
+        template(f'100 Year: {r100}', (r100, r100, rmax, rmax), colors['100 Year']),
+    ]
diff --git a/geoglows/_plots/plotly_retrospective.py b/geoglows/_plots/plotly_retrospective.py
new file mode 100644
index 0000000..b7a8a6b
--- /dev/null
+++ b/geoglows/_plots/plotly_retrospective.py
@@ -0,0 +1,322 @@
+import pandas as pd
+import plotly.graph_objs as go
+import scipy.stats
+from plotly.offline import plot as offline_plot
+
+from .format_tools import build_title
+from .plotly_helpers import _rperiod_scatters
+from ..analyze import daily_stats as compute_daily_stats  # aliased: a plot function below shares the name
+
+__all__ = [
+    'retrospective',
+    'daily_averages',
+    'monthly_averages',
+    'annual_averages',
+    'daily_stats',
+    'daily_variance',
+    'flow_duration_curve',
+]
+
+
+def retrospective(retro: pd.DataFrame, *,
+                  rp_df: pd.DataFrame = None, plot_titles: list = None, ) -> go.Figure:
+    """
+    Makes the retrospective streamflow simulation data and metadata into a plotly plot
+
+    Args:
+        retro: the csv response from historic_simulation
+        rp_df: the csv response from return_periods
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    dates = retro.index.tolist()
+    startdate = dates[0]
+    enddate = dates[-1]
+
+    plot_data = {
+        'x_datetime': dates,
+        'y_flow': retro.values.flatten(),
+        'y_max': retro.values.max(),
+    }
+    if rp_df is not None:
+        plot_data.update(rp_df.to_dict(orient='index').items())
+        rperiod_scatters = _rperiod_scatters(startdate, enddate, rp_df, plot_data['y_max'], plot_data['y_max'])
+    else:
+        rperiod_scatters = []
+
+    scatter_plots = [go.Scatter(
+        name='Retrospective Simulation',
+        x=plot_data['x_datetime'],
+        y=plot_data['y_flow'])
+    ]
+    scatter_plots += rperiod_scatters
+
+    layout = go.Layout(
+        title=build_title('Retrospective Streamflow Simulation', plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={
+            'title': 'Date (UTC +0:00)',
+            'range': [startdate, enddate],
+            'hoverformat': '%b %d %Y',
+            'tickformat': '%Y'
+        },
+    )
+    return go.Figure(scatter_plots, layout=layout)
+
+
+def daily_averages(dayavg: pd.DataFrame, plot_titles: list = None, plot_type: str = 'plotly') -> go.Figure:
+    """
+    Makes the daily_averages data and metadata into a plotly plot
+
+    Args:
+        dayavg: the csv response from daily_averages
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+        plot_type: either 'plotly', or 'html' (default plotly)
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    if plot_type not in ['plotly_scatters', 'plotly', 'html']:
+        raise ValueError('invalid plot_type specified. pick plotly, plotly_scatters, or html')
+
+    scatter_plots = [
+        go.Scatter(
+            name='Average Daily Flow',
+            x=dayavg.index.tolist(),
+            y=dayavg.values.flatten(),
+            line=dict(color='blue')
+        ),
+    ]
+    if plot_type == 'plotly_scatters':
+        return scatter_plots
+    layout = go.Layout(
+        title=build_title('Daily Average Streamflow (Simulated)', plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'},
+    )
+    figure = go.Figure(scatter_plots, layout=layout)
+    if plot_type == 'plotly':
+        return figure
+    if plot_type == 'html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose plotly, plotly_scatters, or html')
+
+
+def monthly_averages(monavg: pd.DataFrame, plot_titles: list = None, plot_type: str = 'plotly') -> go.Figure:
+    """
+    Makes the monthly_averages data and metadata into a plotly plot
+
+    Args:
+        monavg: the csv response from monthly_averages
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+        plot_type: either 'plotly', or 'html' (default plotly)
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    if plot_type not in ['plotly_scatters', 'plotly', 'html']:
+        raise ValueError('invalid plot_type specified. pick plotly, plotly_scatters, or html')
+
+    scatter_plots = [
+        go.Scatter(
+            name='Average Monthly Flow',
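+            # month numbers ('01'-'12') are converted to month names for the x axis labels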
+            x=pd.to_datetime(monavg.index, format='%m').strftime('%B'),
+            y=monavg.values.flatten(),
+            line=dict(color='blue')
+        ),
+    ]
+    if plot_type == 'plotly_scatters':
+        return scatter_plots
+    layout = go.Layout(
+        title=build_title('Monthly Average Streamflow (Simulated)', plot_titles=plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)'},
+        xaxis={'title': 'Month'},
+    )
+    figure = go.Figure(scatter_plots, layout=layout)
+    if plot_type == 'plotly':
+        return figure
+    if plot_type == 'html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose plotly, plotly_scatters, or html')
+
+
+def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figure:
+    """
+    Makes the annual_averages data and metadata into a plotly plot
+
+    Args:
+        df: the csv response from annual_averages
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    scatter_plots = [
+        go.Scatter(
+            name='Average Annual Flow',
+            x=df.index.tolist(),
+            y=df.values.flatten(),
+            line=dict(color='blue')
+        ),
+    ]
+    layout = go.Layout(
+        title=build_title('Annual Average Streamflow (Simulated)', plot_titles),
+        yaxis={'title': 'Streamflow (m3/s)'},
+        xaxis={'title': 'Year'},
+    )
+    return go.Figure(scatter_plots, layout=layout)
+
+
+def flow_duration_curve(hist: pd.DataFrame, titles: dict = None, plot_type: str = 'plotly') -> go.Figure:
+    """
+    Makes a flow duration curve from the historical simulation data
+
+    Args:
+        hist: the csv response from historic_simulation
+        titles: (dict) Extra info to show on the title of the plot.
+            For example: {'Reach ID': 1234567, 'Drainage Area': '1000km^2'}
+        plot_type: either 'json', 'plotly', or 'html' (default plotly)
+
+    Return:
+        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
+    """
+    if plot_type not in ['json', 'plotly_scatters', 'plotly', 'html']:
+        raise ValueError('invalid plot_type specified. pick json, plotly, plotly_scatters, or html')
+
+    # process the hist dataframe to create the flow duration curve
+    sorted_hist = hist.values.flatten()
+    sorted_hist.sort()
+
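+    # approximately the Weibull plotting position: flows are ranked in descending order and
+    # each flow's exceedance probability is estimated as rank / (n + 1)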
+    # ranks data from smallest to largest
+    ranks = len(sorted_hist) - scipy.stats.rankdata(sorted_hist, method='average')
+
+    # calculate probability of each rank
+    prob = [(ranks[i] / (len(sorted_hist) + 1)) for i in range(len(sorted_hist))]
+
+    plot_data = {
+        'x_probability': prob,
+        'y_flow': sorted_hist,
+        'y_max': sorted_hist[0],
+    }
+
+    if plot_type == 'json':
+        return plot_data
+
+    scatter_plots = [
+        go.Scatter(
+            name='Flow Duration Curve',
+            x=plot_data['x_probability'],
+            y=plot_data['y_flow'])
+    ]
+    if plot_type == 'plotly_scatters':
+        return scatter_plots
+    layout = go.Layout(
+        title=build_title('Flow Duration Curve', titles),
+        xaxis={'title': 'Exceedance Probability'},
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+    )
+    figure = go.Figure(scatter_plots, layout=layout)
+    if plot_type == 'plotly':
+        return figure
+    if plot_type == 'html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose json, plotly, plotly_scatters, or html')
+
+
+def daily_stats(hist: pd.DataFrame, titles: dict = None, plot_type: str = 'plotly') -> go.Figure:
+    """
+    Plots a graph with statistics for each day of year
+
+    Args:
+        hist: dataframe of values to plot
+        titles: (dict) Extra info to show on the title of the plot.
+            For example: {'Reach ID': 1234567, 'Drainage Area': '1000km^2'}
+        plot_type: either 'plotly' (python object, default), 'plotly_scatters', or 'html'
+
+    returns:
+        plot of the daily statistics of the flows
+    """
+    # compute the day-of-year statistics table with geoglows.analyze.daily_stats
+    stats_df = compute_daily_stats(hist)
+
+    data = [
+        go.Scatter(
+            x=stats_df.index.tolist(),
+            y=stats_df[column].tolist(),
+            name=column
+        ) for column in stats_df.columns
+    ]
+
+    if plot_type == 'plotly_scatters':
+        return data
+    layout = go.Layout(
+        title=build_title('Daily Average Streamflow (Simulated)', titles),
+        yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']},
+        xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'},
+    )
+    figure = go.Figure(data=data, layout=layout)
+    if plot_type == 'plotly':
+        return figure
+    elif plot_type == 'html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose plotly or html')
+
+
+def daily_variance(daily_variance: pd.DataFrame, plot_titles: list = None, plot_type: str = 'plotly') -> go.Figure:
+    """
+    Plots a dataframe of daily standard deviations, such as the product of geoglows.analyze.daily_variance
+
+    Args:
+        daily_variance: dataframe of daily standard deviations with 'date' and 'flow_std' columns
+        plot_titles: (list) Extra info to show on the title of the plot.
+            For example: ['Reach ID: 1234567', 'Drainage Area: 1000km^2']
+        plot_type: either 'plotly' (python object, default), 'plotly_scatters', or 'html'
+
+    returns:
+        plot of the daily flow standard deviation
+    """
+    data = [
+        go.Scatter(
+            x=daily_variance['date'],
+            y=daily_variance['flow_std'],
+            name='Daily Standard Deviation'
+        ),
+    ]
+    if plot_type == 'plotly_scatters':
+        return data
+    figure = go.Figure(data=data, layout=go.Layout(title=build_title('Daily Flow Standard Deviation', plot_titles)))
+    if plot_type == 'plotly':
+        return figure
+    elif plot_type == 'html':
+        return offline_plot(
+            figure,
+            config={'autosizable': True, 'responsive': True},
+            output_type='div',
+            include_plotlyjs=False
+        )
+    raise ValueError('Invalid plot_type chosen. Choose plotly or html')
diff --git a/geoglows/_plots/plots.py b/geoglows/_plots/plots.py
new file mode 100644
index 0000000..500626b
--- /dev/null
+++ b/geoglows/_plots/plots.py
@@ -0,0 +1,169 @@
+import pandas as pd
+import plotly.graph_objects as go
+
+from .format_tools import plotly_figure_to_html_plot
+from .plotly_forecasts import (
+    forecast as plotly_forecast,
+    forecast_stats as plotly_forecast_stats,
+    forecast_ensembles as plotly_forecast_ensembles
+)
+from .plotly_retrospective import (
+    retrospective as plotly_retrospective,
+    daily_averages as plotly_daily_averages,
+    monthly_averages as plotly_monthly_averages,
+    annual_averages as plotly_annual_averages,
+    daily_variance as plotly_daily_variance,
+    flow_duration_curve as plotly_flow_duration_curve
+)
+
+
+def forecast(df: pd.DataFrame, *,
+             plot_type: str = 'plotly',
+             rp_df: pd.DataFrame = None,
+             plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots forecasted streamflow and optional return periods
+
+    Args:
+        df: the dataframe response from geoglows.data.forecast
+        plot_type: either 'plotly' (default) or 'html'
+        rp_df: optional dataframe of return period data
+        plot_titles: optional list of strings to include in the figure title. Each list item will be on a new line.
+
+    Returns:
+        go.Figure
+    """
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_forecast(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # elif plot_type == 'matplotlib':
+    #     return mpl_forecast(df, rp_df=rp_df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def forecast_stats(df: pd.DataFrame, *,
+                   plot_type: str = 'plotly',
+                   rp_df: pd.DataFrame = None,
+                   plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots the statistics of the forecasted streamflow ensembles and optional return periods
+
+    Args:
+        df: the dataframe response from geoglows.data.forecast_stats
+        plot_type: either 'plotly' (default) or 'html'
+        rp_df: optional dataframe of return period data
+        plot_titles: optional list of strings to include in the figure title. Each list item will be on a new line.
+
+    Returns:
+        go.Figure
+    """
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_forecast_stats(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # elif plot_type in ('matplotlib', 'mpl'):
+    #     return mpl_forecast_stats(df, rp_df=rp_df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def forecast_ensembles(df: pd.DataFrame, *,
+                       plot_type: str = 'plotly',
+                       rp_df: pd.DataFrame = None,
+                       plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots each of the forecasted streamflow ensemble members and optional return periods
+
+    Args:
+        df: the dataframe response from geoglows.data.forecast_ensembles
+        plot_type: either 'plotly' (default) or 'html'
+        rp_df: optional dataframe of return period data
+        plot_titles: optional list of strings to include in the figure title. Each list item will be on a new line.
+
+    Returns:
+        go.Figure
+    """
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_forecast_ensembles(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # elif plot_type == 'matplotlib':
+    #     return mpl_forecast_ensembles(df, rp_df=rp_df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def retrospective(df: pd.DataFrame, *,
+                  plot_type: str = 'plotly',
+                  rp_df: pd.DataFrame = None,
+                  plot_titles: list = None, ) -> go.Figure:
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_retrospective(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # if plot_type == 'matplotlib':
+    #     return mpl_retrospective(df, rp_df=rp_df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def daily_averages(df: pd.DataFrame, *,
+                   plot_type: str = 'plotly',
+                   plot_titles: list = None, ) -> go.Figure:
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_daily_averages(df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # if plot_type == 'matplotlib':
+    #     return mpl_daily_averages(df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def monthly_averages(df: pd.DataFrame, *,
+                     plot_type: str = 'plotly',
+                     plot_titles: list = None, ) -> go.Figure:
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_monthly_averages(df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # if plot_type == 'matplotlib':
+    #     return mpl_monthly_averages(df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def annual_averages(df: pd.DataFrame, *,
+                    plot_type: str = 'plotly',
+                    plot_titles: list = None, ) -> go.Figure:
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_annual_averages(df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    # if plot_type == 'matplotlib':
+    #     return mpl_annual_averages(df, plot_titles=plot_titles)
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def daily_variance(df: pd.DataFrame, *,
+                   plot_type: str = 'plotly',
+                   plot_titles: list = None, ) -> go.Figure:
+    if plot_type == 'plotly':
+        # delegate to the plotly implementation in plotly_retrospective
+        return plotly_daily_variance(df, plot_titles=plot_titles)
+    # if plot_type == 'matplotlib':
+    #     return
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
+def flow_duration_curve(df: pd.DataFrame, *,
+                        plot_type: str = 'plotly',
+                        plot_titles: list = None, ) -> go.Figure:
+    if plot_type == 'plotly':
+        # delegate to the plotly implementation in plotly_retrospective
+        # (it accepts an optional titles dict rather than a plot_titles list)
+        return plotly_flow_duration_curve(df)
+    # if plot_type == 'matplotlib':
+    #     return
+    else:
+        raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
diff --git a/geoglows/analysis.py b/geoglows/analysis.py
deleted file mode 100644
index 8c19114..0000000
--- a/geoglows/analysis.py
+++ /dev/null
@@ -1,179 +0,0 @@
-import math
-
-import hydrostats.data
-import pandas as pd
-import numpy as np
-
-__all__ = ['compute_daily_average', 'compute_daily_variance', 'compute_monthly_average', 'compute_return_periods',
-           'compute_anomaly', 'compute_daily_statistics', 'compute_low_return_periods']
-
-
-def gumbel1(rp: int, xbar: float, std: float) -> float:
-    """
-    Solves the Gumbel Type 1 distribution
-    Args:
-        rp: return period (years)
-        xbar: average of the dataset
-        std: standard deviation of the dataset
-
-    Returns:
-        float: solution to gumbel distribution
-    """
-    return round(-math.log(-math.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std), 3)
-
-
-def compute_daily_average(hist: pd.DataFrame) -> pd.DataFrame:
-    """
-    Processes the historical simulation data into daily averages, the same as from the DailyAverages data service.
-
-    Args:
-        hist: the csv response from the HistoricSimulation streamflow data service
-
-    Returns:
-        pandas DataFrame with an index of "%m/%d" dates for each day of the year and a column labeled 'streamflow_m^3/s'
-    """
-    return hydrostats.data.daily_average(hist, rolling=True)
-
-
-def compute_daily_variance(hist: pd.DataFrame) -> pd.DataFrame:
-    """
-    Processes the historical simulation data into daily variance -> the standard deviation for each day of the year
-
-    Args:
-        hist: the csv response from the HistoricSimulation streamflow data service (datetime index, 1 column of values)
-
-    Returns:
-        pandas DataFrame with an index of "%m/%d" dates for each day of the year and a column labeled 'std_dv'
-    """
-    daily_variance = hist.copy()
-    daily_variance.columns = ['flow_std', ]
-    daily_variance['day_of_year'] = daily_variance.index.strftime('%j')
-    daily_variance['date'] = daily_variance.index.strftime('%m/%d')
-    return daily_variance.groupby('day_of_year').std().join(daily_variance.groupby('day_of_year').first()[['date', ]])
-
-
-def compute_daily_statistics(hist: pd.DataFrame) -> pd.DataFrame:
-    """
-    Calculates the statistics for a datafame given the day of year
-
-    Args:
-        hist: historical data to compute daily statistics from
-
-    Return:
-        pd.DataFrame: dataframe with average, min, 25% values, median, 75% value and max value for each day of year
-    """
-    streamflow_data = hist.copy()
-    streamflow_data.index = streamflow_data.index.strftime('%m/%d')
-    streamflow_data.index.name = 'day'
-    daily_grouped = streamflow_data.groupby('day')
-    return (
-        daily_grouped.mean()
-        .merge(daily_grouped.min(), left_index=True, right_index=True, suffixes=('_avg', '_min'))
-        .merge(daily_grouped.quantile(.25), left_index=True, right_index=True)
-        .merge(daily_grouped.median(), left_index=True, right_index=True, suffixes=('_25%', '_med'))
-        .merge(daily_grouped.quantile(.75), left_index=True, right_index=True)
-        .merge(daily_grouped.max(), left_index=True, right_index=True, suffixes=('_75%', '_max'))
-    )
-
-
-def compute_monthly_average(hist: pd.DataFrame) -> pd.DataFrame:
-    """
-    Processes the historic simulation data into monthly averages, the same as from the MonthlyAverages data service
-
-    Args:
-        hist: the csv response from the HistoricSimulation streamflow data service
-
-    Returns:
-        pandas DataFrame with an index of "%m" values for each month and a column labeled 'streamflow_m^3/s'
-    """
-    return hydrostats.data.monthly_average(hist)
-
-
-def compute_return_periods(hist: pd.DataFrame, rps: tuple = (2, 5, 10, 25, 50, 100)) -> dict:
-    """
-    Solves the Gumbel Type-I distribution using the annual maximum flow from the historic simulation
-
-    Args:
-        hist: the csv response from the HistoricSimulation streamflow data service
-        rps: a tuple of integer return period numbers to compute
-
-    Returns:
-        dictionary with keys labeled f'{return_period}_year' and float values
-    """
-    annual_max_flow_list = []
-    return_periods = {}
-
-    year_min = hist.index.min().year
-    year_max = hist.index.max().year
-
-    for y in range(year_min, year_max + 1):
-        annual_max_flow_list.append(hist[hist.index.year == int(y)].max())
-
-    annual_max_flow_list = np.array(annual_max_flow_list)
-    xbar = np.mean(annual_max_flow_list)
-    std = np.std(annual_max_flow_list)
-
-    for rp in rps:
-        return_periods[f'return_period_{rp}'] = gumbel1(rp, xbar, std)
-
-    return return_periods
-
-
-def compute_low_return_periods(hist: pd.DataFrame, rps: tuple = (2, 5, 10, 25, 50, 100)) -> dict:
-    """
-    Solves the Gumbel Type-I distribution using the annual minimum flow from the historic simulation
annual minimum flow from the historic simulation - - Args: - hist: the csv response from the HistoricSimulation streamflow data service - rps: a tuple of integer return period numbers to compute - - Returns: - dictionary with keys labeled f'{return_period}_year' and float values - """ - annual_min_flow_list = [] - low_flows = {} - - year_min = hist.index.min().year - year_max = hist.index.max().year - - for y in range(year_min, year_max + 1): - annual_min_flow_list.append(hist[hist.index.year == int(y)].min()) - - annual_min_flow_list = np.array(annual_min_flow_list) - xbar = np.mean(annual_min_flow_list) - std = np.std(annual_min_flow_list) - - for rp in rps: - value = gumbel1(rp, xbar, std) - value = value if value > 0 else 0 - low_flows[f'return_period_{rp}'] = value - - return low_flows - - -def compute_anomaly(stats: pd.DataFrame, day_avgs: pd.DataFrame, daily: bool = True) -> pd.DataFrame: - """ - Compute the anomaly between the average forecasted flow and the daily average - - Args: - stats: the csv response from the ForecastStats data service - day_avgs: the csv response from the DailyAverages data service - daily: if true, aggregate the hourly forecast to a daily average before computing the anomaly - - Returns: - pandas DataFrame with a datetime index and a column labeled 'anomaly_m^3/s' - """ - anomaly_df = pd.DataFrame(stats['flow_avg_m^3/s'], index=stats.index) - anomaly_df = anomaly_df.dropna() - - if daily: - anomaly_df = anomaly_df.resample('D').mean() - - anomaly_df['datetime'] = anomaly_df.index - anomaly_df.index = anomaly_df.index.strftime("%m/%d") - anomaly_df = anomaly_df.join(day_avgs, how="inner") - anomaly_df['anomaly_m^3/s'] = anomaly_df['flow_avg_m^3/s'] - anomaly_df['streamflow_m^3/s'] - anomaly_df.index = anomaly_df['datetime'] - del anomaly_df['flow_avg_m^3/s'], anomaly_df['datetime'], anomaly_df['streamflow_m^3/s'] - - return anomaly_df diff --git a/geoglows/analyze.py b/geoglows/analyze.py new file mode 100644 index 0000000..6bd9183 --- /dev/null +++ b/geoglows/analyze.py @@ -0,0 +1,255 @@ +import math + +import numpy as np +import pandas as pd + +__all__ = [ + 'gumbel1', + 'simple_forecast', + 'forecast_stats', + 'daily_averages', + 'monthly_averages', + 'annual_averages', + 'daily_stats', + 'daily_variance', + 'daily_flow_anomaly', + 'return_periods', + 'low_return_periods' +] + + +def gumbel1(rp: int, xbar: float, std: float) -> float: + """ + Solves the Gumbel Type 1 distribution + Args: + rp: return period (years) + xbar: average of the dataset + std: standard deviation of the dataset + + Returns: + float: solution to gumbel distribution + """ + return round(-math.log(-math.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std), 2) + + +def simple_forecast(ens: pd.DataFrame) -> pd.DataFrame: + """ + Calculates the simple forecast from a dataframe of forecast ensembles + + Args: + ens: a dataframe of forecast ensembles + + Returns: + pandas DataFrame with datetime index and columns flow_uncertainty_upper, flow_median, flow_uncertainty_lower + """ + df = ( + ens + .drop(columns=['ensemble_52']) + .dropna() + ) + df = pd.DataFrame({ + f'flow_uncertainty_upper': np.nanpercentile(df.values, 80, axis=1), + f'flow_median': np.median(df.values, axis=1), + f'flow_uncertainty_lower': np.nanpercentile(df.values, 20, axis=1), + }, index=df.index) + return df + + +def forecast_stats(ens: pd.DataFrame) -> pd.DataFrame: + """ + Calculates the statistics for a dataframe of forecast ensembles + + Args: + ens: a dataframe of forecast ensembles + + Returns: + pandas 
DataFrame with an index of datetime and columns flow_min, flow_25p, flow_avg, flow_med, flow_75p, flow_max, high_res
+    """
+    df = (
+        ens
+        .drop(columns=['ensemble_52'])
+        .dropna()
+    )
+    return (
+        pd
+        .DataFrame(
+            {
+                'flow_min': df.min(axis=1),
+                'flow_25p': df.quantile(.25, axis=1),
+                'flow_avg': df.mean(axis=1),
+                'flow_med': df.median(axis=1),
+                'flow_75p': df.quantile(.75, axis=1),
+                'flow_max': df.max(axis=1),
+            },
+            index=df.index
+        )
+        .merge(
+            ens[['ensemble_52']]
+            .rename(columns={'ensemble_52': 'high_res'}),
+            left_index=True,
+            right_index=True,
+            how='outer'
+        )
+        .sort_index(ascending=True)
+    )
+
+
+def daily_averages(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates the daily average of a dataframe with a datetime index
+
+    Args:
+        df: a dataframe of retrospective simulation data
+
+    Returns:
+        pandas DataFrame with an index of "%m/%d" dates for each day of the year
+    """
+    return df.groupby(df.index.strftime('%m/%d')).mean()
+
+
+def monthly_averages(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates the monthly average of a dataframe with a datetime index
+
+    Args:
+        df: a dataframe of retrospective simulation data
+
+    Returns:
+        pandas DataFrame with an index of "%m" values for each month
+    """
+    return df.groupby(df.index.strftime('%m')).mean()
+
+
+def annual_averages(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates the annual average of a dataframe with a datetime index
+
+    Args:
+        df: a dataframe of retrospective simulation data
+
+    Returns:
+        pandas DataFrame with an index of "%Y" values for each year
+    """
+    # select years with >= 365 entries
+    df = df.groupby(df.index.strftime('%Y')).filter(lambda x: len(x) >= 365)
+    return df.groupby(df.index.strftime('%Y')).mean()
+
+
+def daily_variance(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculate the daily standard deviation of a dataframe with a datetime index
+
+    Args:
+        df: a dataframe of retrospective simulation data
+
+    Returns:
+        pandas DataFrame with an index of "%m/%d" dates for each day of the year
+    """
+    return df.groupby(df.index.strftime('%m/%d')).std()
+
+
+def daily_stats(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates the statistics for a dataframe grouped by the day of year
+
+    Args:
+        df: historical data to compute daily statistics from
+
+    Return:
+        pd.DataFrame: dataframe with average, min, 25% value, median, 75% value and max value for each day of year
+    """
+    daily_grouped = df.groupby(df.index.strftime('%m/%d'))
+    return (
+        daily_grouped
+        .mean()
+        .merge(daily_grouped.min(), left_index=True, right_index=True, suffixes=('_avg', '_min'))
+        .merge(daily_grouped.quantile(.25), left_index=True, right_index=True)
+        .merge(daily_grouped.median(), left_index=True, right_index=True, suffixes=('_25%', '_med'))
+        .merge(daily_grouped.quantile(.75), left_index=True, right_index=True)
+        .merge(daily_grouped.max(), left_index=True, right_index=True, suffixes=('_75%', '_max'))
+    )
+
+
+def daily_flow_anomaly(stats: pd.DataFrame, day_avgs: pd.DataFrame, daily: bool = True) -> pd.DataFrame:
+    """
+    Compute the anomaly between the average forecasted flow and the daily average
+
+    Args:
+        stats: the csv response from the ForecastStats data service
+        day_avgs: the csv response from the DailyAverages data service
+        daily: if true, aggregate the hourly forecast to a daily average before computing the anomaly
+
+    Returns:
+        pandas DataFrame with a datetime index and a column labeled 'anomaly_m^3/s'
+    """
+    anomaly_df = pd.DataFrame(stats['flow_avg_m^3/s'], index=stats.index)
+    anomaly_df = anomaly_df.dropna()
+
+    if daily:
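+        # resample the hourly forecast averages to daily means so they align with the day-of-year averages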
anomaly_df = anomaly_df.resample('D').mean() + + anomaly_df['datetime'] = anomaly_df.index + anomaly_df.index = anomaly_df.index.strftime("%m/%d") + anomaly_df = anomaly_df.join(day_avgs, how="inner") + anomaly_df['anomaly_m^3/s'] = anomaly_df['flow_avg_m^3/s'] - anomaly_df['streamflow_m^3/s'] + anomaly_df.index = anomaly_df['datetime'] + del anomaly_df['flow_avg_m^3/s'], anomaly_df['datetime'], anomaly_df['streamflow_m^3/s'] + + return anomaly_df + + +def return_periods(df: pd.DataFrame, rps: int or tuple = (2, 5, 10, 25, 50, 100)) -> dict: + """ + Solves the Gumbel Type-I distribution using the annual maximum flow from the historic simulation + + Args: + df: a dataframe of retrospective simulation data + rps: an integer or iterable of integer return period numbers to compute + + Returns: + dict with keys 'max_simulated' and 'return_period_{year}' for each year with float values to 2 decimals + """ + annual_max_flow_list = df.groupby(df.index.strftime('%Y')).max().values + xbar = np.mean(annual_max_flow_list) + std = np.std(annual_max_flow_list) + + if type(rps) is int: + rps = (rps,) + + ret_pers = { + 'max_simulated': round(np.max(annual_max_flow_list), 2) + } + ret_pers.update({f'return_period_{rp}': round(gumbel1(rp, xbar, std), 2) for rp in rps}) + return ret_pers + + +def low_return_periods(hist: pd.DataFrame, rps: tuple = (2, 5, 10, 25, 50, 100)) -> dict: + """ + Solves the Gumbel Type-I distribution using the annual minimum flow from the historic simulation + + Args: + hist: the csv response from the HistoricSimulation streamflow data service + rps: a tuple of integer return period numbers to compute + + Returns: + dictionary with keys labeled f'{return_period}_year' and float values + """ + annual_min_flow_list = [] + low_flows = {} + + year_min = hist.index.min().year + year_max = hist.index.max().year + + for y in range(year_min, year_max + 1): + annual_min_flow_list.append(hist[hist.index.year == int(y)].min()) + + annual_min_flow_list = np.array(annual_min_flow_list) + xbar = np.mean(annual_min_flow_list) + std = np.std(annual_min_flow_list) + + for rp in rps: + value = gumbel1(rp, xbar, std) + value = value if value > 0 else 0 + low_flows[f'return_period_{rp}'] = value + + return low_flows diff --git a/geoglows/bias.py b/geoglows/bias.py index 272bf35..f3d2d56 100644 --- a/geoglows/bias.py +++ b/geoglows/bias.py @@ -1,13 +1,17 @@ import math +import warnings import hydrostats as hs import hydrostats.data as hd import numpy as np import pandas as pd from scipy import interpolate -import warnings -__all__ = ['correct_historical', 'correct_forecast', 'statistics_tables'] +__all__ = [ + 'correct_historical', + 'correct_forecast', + 'statistics_tables', +] def correct_historical(simulated_data: pd.DataFrame, observed_data: pd.DataFrame) -> pd.DataFrame: diff --git a/geoglows/data.py b/geoglows/data.py new file mode 100644 index 0000000..977fcef --- /dev/null +++ b/geoglows/data.py @@ -0,0 +1,356 @@ +import os +import warnings +from io import StringIO + +import pandas as pd +import requests +import s3fs +import xarray as xr + +from ._constants import METADATA_TABLE_LOCAL_PATH +from .analyze import ( + simple_forecast as calc_simple_forecast, + forecast_stats as calc_forecast_stats, + daily_averages as calc_daily_averages, + monthly_averages as calc_monthly_averages, + annual_averages as calc_annual_averages, +) + +__all__ = [ + 'dates', + 'forecast', + 'forecast_stats', + 'forecast_ensembles', + 'forecast_records', + + 'retrospective', + 'daily_averages', + 'monthly_averages', + 
'annual_averages', + 'return_periods', + + 'metadata_tables', +] + +DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/' +DEFAULT_REST_ENDPOINT_VERSION = 'v2' # 'v1, v2, latest' +ODP_CORE_S3_BUCKET_URI = 's3://geoglows-v2' +ODP_FORECAST_S3_BUCKET_URI = 's3://geoglows-v2-forecasts' +ODP_RETROSPECTIVE_S3_BUCKET_URI = 's3://geoglows-v2-retrospective' +ODP_S3_BUCKET_REGION = 'us-west-2' + + +def _forecast_endpoint_decorator(function): + def from_aws(*args, **kwargs): + product_name = function.__name__.replace("_", "").lower() + if product_name == 'forecastrecords': + warnings.warn('forecast_records are not available from the AWS Open Data Program.') + return from_rest(*args, **kwargs) + + reach_id = args[0] if len(args) > 0 else None + reach_id = kwargs.get('reach_id', '') if not reach_id else reach_id + + s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) + if kwargs.get('date', '') and not product_name == 'dates': + date = kwargs['date'] + if len(date) == 8: + date = f'{date}00.zarr' + elif len(date) == 10: + date = f'{date}.zarr' + else: + raise ValueError('Date must be YYYYMMDD or YYYYMMDDHH format. Use dates() to view available data.') + else: + dates = sorted([x.split('/')[-1] for x in s3.ls(ODP_FORECAST_S3_BUCKET_URI)]) + if product_name == 'dates': + return dates + date = dates[-1] + s3store = s3fs.S3Map(root=f'{ODP_FORECAST_S3_BUCKET_URI}/{date}', s3=s3, check=False) + + df = xr.open_zarr(s3store).sel(rivid=reach_id).to_dataframe().round(2).reset_index() + + # rename columns to match the REST API + if type(reach_id) is list: + df = df.pivot(columns='ensemble', index=['time', 'rivid'], values='Qout') + else: + df = df.pivot(index='time', columns='ensemble', values='Qout') + df = df[sorted(df.columns)] + df.columns = [f'ensemble_{str(x).zfill(2)}' for x in df.columns] + + if product_name == 'forecastensembles': + return df + elif product_name == 'forecaststats': + return calc_forecast_stats(df) + elif product_name == 'forecast': + return calc_simple_forecast(df) + return + + def from_rest(*args, **kwargs): + # update the default values set by the function unless the user has already specified them + for key, value in function.__kwdefaults__.items() if function.__kwdefaults__ else []: + if key not in kwargs: + kwargs[key] = value + + return_url = kwargs.get('format', 'csv') == 'url' + + # parse out the information necessary to build a request url + endpoint = kwargs.get('endpoint', DEFAULT_REST_ENDPOINT) + endpoint = endpoint[:-1] if endpoint[-1] == '/' else endpoint + endpoint = endpoint + '/api' if not endpoint.endswith('/api') else endpoint + endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint + + version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION) + + product_name = function.__name__.replace("_", "").lower() + + reach_id = args[0] if len(args) > 0 else None + reach_id = kwargs.get('reach_id', '') if not reach_id else reach_id + + return_format = kwargs.get('return_format', 'csv') + assert return_format in ('csv', 'json', 'url'), f'Unsupported return format requested: {return_format}' + + # request parameter validation before submitting + for key in ('endpoint', 'version', 'reach_id'): + if key in kwargs: + del kwargs[key] + for key, value in kwargs.items(): + if value is None: + del kwargs[key] + for date in ('date', 'start_date', 'end_date'): + if date in kwargs: + assert len(str(kwargs[date])) == 8 or len( + str(kwargs[date])) == 10, f'Invalid date format: {kwargs[date]}' + if 
'format' in kwargs and kwargs['format'] != 'json': + del kwargs['format'] + kwargs['source'] = kwargs.get('source', 'pygeoglows') # allow using default for specific apps which override + params = '&'.join([f'{key}={value}' for key, value in kwargs.items()]) + + # piece together the request url + request_url = f'{endpoint}/{version}/{product_name}' # build the base url + request_url = f'{request_url}/{reach_id}' if reach_id else request_url # add the reach_id if it exists + request_url = f'{request_url}?{params}' # add the query parameters + + if return_url: + return request_url.replace(f'source={kwargs["source"]}', '') + + response = requests.get(request_url) + + if response.status_code != 200: + raise RuntimeError('Received an error from the REST API: ' + response.text) + + if return_format == 'csv': + df = pd.read_csv(StringIO(response.text)) + if 'datetime' in df.columns: + df['datetime'] = pd.to_datetime(df['datetime']) + df = df.set_index('datetime') + return df + elif return_format == 'json': + return response.json() + else: + raise ValueError(f'Unsupported return format requested: {return_format}') + + def main(*args, **kwargs): + source = kwargs.get('data_source', 'rest') + assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}') + if source == 'rest': + return from_rest(*args, **kwargs) + else: + return from_aws(*args, **kwargs) + + return main + + +# Forecast data and derived products +@_forecast_endpoint_decorator +def dates(**kwargs) -> dict or str: + """ + Gets a list of available forecast product dates + + Keyword Args: + return_format: csv, json, or url, default csv + data_source: location to query for data, either 'rest' or 'aws'. default is aws. + + Returns: + dict or str + + the csv is a single column with a header of 'available_dates' and 1 row per date, sorted oldest to newest + The dictionary structure is {'available_dates': ['list', 'of', 'dates', 'YYYYMMDD', 'format']} + """ + pass + + +@_forecast_endpoint_decorator +def forecast(*, reach_id: int, date: str, return_format: str, data_source: str, + **kwargs) -> pd.DataFrame or dict or str: + """ + Gets the average forecasted flow for a certain reach_id on a certain date + + Keyword Args: + reach_id: the ID of a stream, should be a 9 digit integer + date: a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified + return_format: csv, json, or url, default csv + data_source: location to query for data, either 'rest' or 'aws'. default is aws. + + Returns: + pd.DataFrame or dict or str + """ + pass + + +@_forecast_endpoint_decorator +def forecast_stats(*, reach_id: int, date: str, return_format: str, data_source: str, + **kwargs) -> pd.DataFrame or dict or str: + """ + Retrieves the min, 25%, mean, median, 75%, and max river discharge of the 51 ensembles members for a reach_id + The 52nd higher resolution member is excluded + + Keyword Args: + reach_id: the ID of a stream, should be a 9 digit integer + date: a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified + return_format: csv, json, or url, default csv + data_source: location to query for data, either 'rest' or 'aws'. default is aws. 
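+
+    Example:
+        A minimal sketch of typical use; the river ID below is hypothetical, look up real IDs at
+        https://data.geoglows.org:
+
+        import geoglows
+        stats_df = geoglows.data.forecast_stats(reach_id=710434546, data_source='aws')
+        print(stats_df[['flow_avg', 'flow_med']].head())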
+ + Returns: + pd.DataFrame or dict or str + """ + pass + + +@_forecast_endpoint_decorator +def forecast_ensembles(*, reach_id: int, date: str, return_format: str, data_source: str, + **kwargs) -> pd.DataFrame or dict or str: + """ + Retrieves each of 52 time series of forecasted discharge for a reach_id on a certain date + + Keyword Args: + reach_id: the ID of a stream, should be a 9 digit integer + date: a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified + return_format: csv, json, or url, default csv + data_source: location to query for data, either 'rest' or 'aws'. default is aws. + + Returns: + pd.DataFrame or dict or str + """ + pass + + +@_forecast_endpoint_decorator +def forecast_records(*, reach_id: int, start_date: str, end_date: str, return_format: str, data_source: str, + **kwargs) -> pd.DataFrame or dict or str: + """ + Retrieves a csv showing the ensemble average forecasted flow for the year from January 1 to the current date + + Keyword Args: + reach_id: the ID of a stream, should be a 9 digit integer + start_date: a YYYYMMDD string giving the earliest date this year to include, defaults to 14 days ago. + end_date: a YYYYMMDD string giving the latest date this year to include, defaults to latest available + data_source: location to query for data, either 'rest' or 'aws'. default is aws. + return_format: csv, json, or url, default csv + + Returns: + pd.DataFrame or dict or str + """ + pass + + +# Retrospective simulation and derived products +def retrospective(reach_id: int or list) -> pd.DataFrame: + """ + Retrieves the retrospective simulation of streamflow for a given reach_id from the + AWS Open Data Program GEOGloWS V2 S3 bucket + + Args: + reach_id: the ID of a stream, should be a 9 digit integer + + Returns: + pd.DataFrame + """ + s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) + s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) + return (xr.open_zarr(s3store).sel(rivid=reach_id).to_dataframe().reset_index().set_index('time') + .pivot(columns='rivid', values='Qout')) + + +def historical(*args, **kwargs): + """Alias for retrospective""" + return retrospective(*args, **kwargs) + + +def daily_averages(reach_id: int or list) -> pd.DataFrame: + """ + Retrieves daily average streamflow for a given reach_id + + Args: + reach_id: the ID of a stream, should be a 9 digit integer + + Returns: + pd.DataFrame + """ + df = retrospective(reach_id) + return calc_daily_averages(df) + + +def monthly_averages(reach_id: int or list) -> pd.DataFrame: + """ + Retrieves monthly average streamflow for a given reach_id + + Args: + reach_id: the ID of a stream, should be a 9 digit integer + + Returns: + pd.DataFrame + """ + df = retrospective(reach_id) + return calc_monthly_averages(df) + + +def annual_averages(reach_id: int or list) -> pd.DataFrame: + """ + Retrieves annual average streamflow for a given reach_id + + Args: + reach_id: the ID of a stream, should be a 9 digit integer + + Returns: + pd.DataFrame + """ + df = retrospective(reach_id) + return calc_annual_averages(df) + + +def return_periods(reach_id: int or list) -> pd.DataFrame: + """ + Retrieves the return period thresholds based on a specified historic simulation forcing on a certain reach_id. 
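+
+    A minimal usage sketch (the river ID is hypothetical; look up real IDs at https://data.geoglows.org):
+
+        import geoglows
+        rp_df = geoglows.data.return_periods(710434546)
+        # columns are the return period lengths in years; values are the flow thresholds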
+ + Args: + reach_id: the ID of a stream, should be a 9 digit integer + + Returns: + pd.DataFrame + """ + s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) + s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) + return (xr.open_zarr(s3store).sel(rivid=reach_id)['return_period_flow'].to_dataframe().reset_index() + .pivot(index='rivid', columns='return_period', values='return_period_flow')) + + +# model config and supplementary data +def metadata_tables(columns: list = None) -> pd.DataFrame: + """ + Retrieves the master table of stream reaches with all metadata and properties as a pandas DataFrame + Args: + columns: optional subset of columns names to read from the parquet + + Returns: + pd.DataFrame + """ + if os.path.exists(METADATA_TABLE_LOCAL_PATH): + return pd.read_parquet(METADATA_TABLE_LOCAL_PATH, columns=columns) + warn = f""" + Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and + to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_LOCAL_PATH}. + """ + warnings.warn(warn) + df = pd.read_parquet('https://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') + os.makedirs(os.path.dirname(METADATA_TABLE_LOCAL_PATH), exist_ok=True) + df.to_parquet(METADATA_TABLE_LOCAL_PATH) + return df[columns] if columns else df diff --git a/geoglows/data/ens_example.pickle b/geoglows/data/ens_example.pickle deleted file mode 100644 index dc3a870..0000000 Binary files a/geoglows/data/ens_example.pickle and /dev/null differ diff --git a/geoglows/data/hist_example.pickle b/geoglows/data/hist_example.pickle deleted file mode 100644 index 9139ec8..0000000 Binary files a/geoglows/data/hist_example.pickle and /dev/null differ diff --git a/geoglows/data/rec_example.pickle b/geoglows/data/rec_example.pickle deleted file mode 100644 index 254669f..0000000 Binary files a/geoglows/data/rec_example.pickle and /dev/null differ diff --git a/geoglows/data/rper_example.pickle b/geoglows/data/rper_example.pickle deleted file mode 100644 index d3f1ca7..0000000 Binary files a/geoglows/data/rper_example.pickle and /dev/null differ diff --git a/geoglows/data/stats_example.pickle b/geoglows/data/stats_example.pickle deleted file mode 100644 index fa1d1e1..0000000 Binary files a/geoglows/data/stats_example.pickle and /dev/null differ diff --git a/geoglows/examples.py b/geoglows/examples.py deleted file mode 100644 index 1661b8b..0000000 --- a/geoglows/examples.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import pandas as pd - - -def stats(): - """ - Returns a pandas dataframe of forecast stats data which provides a good visualization - """ - return pd.read_pickle(os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'stats_example.pickle'))) - - -def ens(): - """ - Returns a pandas dataframe of forecast ensembles data which provides a good visualization - """ - return pd.read_pickle(os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'ens_example.pickle'))) - - -def hist(): - """ - Returns a pandas dataframe of historical simulation data which provides a good visualization - """ - return pd.read_pickle(os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'hist_example.pickle'))) - - -def rec(): - """ - Returns a pandas dataframe of forecast records data which provides a good visualization - """ - return 
pd.read_pickle(os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'rec_example.pickle'))) - - -def rper(): - """ - Returns a pandas dataframe of return periods data which provides a good visualization - """ - return pd.read_pickle(os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'rper_example.pickle'))) diff --git a/geoglows/geometry/africa-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/africa-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 53c8167..0000000 Binary files a/geoglows/geometry/africa-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/australia-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/australia-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 2cae859..0000000 Binary files a/geoglows/geometry/australia-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/boundaries.pickle b/geoglows/geometry/boundaries.pickle deleted file mode 100644 index 9a889ca..0000000 Binary files a/geoglows/geometry/boundaries.pickle and /dev/null differ diff --git a/geoglows/geometry/central_america-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/central_america-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 308af50..0000000 Binary files a/geoglows/geometry/central_america-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/central_asia-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/central_asia-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index d4ec6ea..0000000 Binary files a/geoglows/geometry/central_asia-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/east_asia-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/east_asia-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 811ec63..0000000 Binary files a/geoglows/geometry/east_asia-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/europe-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/europe-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 32c4acb..0000000 Binary files a/geoglows/geometry/europe-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/islands-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/islands-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 6d2c283..0000000 Binary files a/geoglows/geometry/islands-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/japan-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/japan-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 822ad8b..0000000 Binary files a/geoglows/geometry/japan-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/middle_east-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/middle_east-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 95cc3bb..0000000 Binary files a/geoglows/geometry/middle_east-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/north_america-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/north_america-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index ce6fc9f..0000000 Binary files a/geoglows/geometry/north_america-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/south_america-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/south_america-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 69ec08d..0000000 Binary files 
a/geoglows/geometry/south_america-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/south_asia-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/south_asia-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 23cba3b..0000000 Binary files a/geoglows/geometry/south_asia-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/geometry/west_asia-geoglows-comid_lat_lon_z.pickle b/geoglows/geometry/west_asia-geoglows-comid_lat_lon_z.pickle deleted file mode 100644 index 44f311e..0000000 Binary files a/geoglows/geometry/west_asia-geoglows-comid_lat_lon_z.pickle and /dev/null differ diff --git a/geoglows/plots.py b/geoglows/plots.py deleted file mode 100644 index 8cb0e71..0000000 --- a/geoglows/plots.py +++ /dev/null @@ -1,1252 +0,0 @@ -import datetime -import json -import os - -import hydrostats.data as hd -import jinja2 -import pandas as pd -import plotly.graph_objs as go -import scipy.stats -from plotly.offline import plot as offline_plot - -from .analysis import compute_daily_statistics - -__all__ = ['hydroviewer', 'forecast_stats', 'forecast_records', 'forecast_ensembles', 'daily_variance', - 'historic_simulation', 'daily_averages', 'monthly_averages', 'flow_duration_curve', 'probabilities_table', - 'return_periods_table', 'corrected_historical', 'corrected_scatterplots', 'corrected_day_average', - 'corrected_month_average', 'corrected_volume_compare', 'daily_stats'] - - -# FUNCTIONS THAT PROCESS THE RESULTS OF THE API INTO A PLOTLY PLOT OR DICTIONARY -def hydroviewer(recs: pd.DataFrame, stats: pd.DataFrame, ensem: pd.DataFrame, rperiods: pd.DataFrame = None, - record_days: int = 7, outformat: str = 'plotly', titles: dict = False, hide_ensem: bool = True, - hide_rperiods: bool = True) -> go.Figure: - """ - Creates the standard plot for a hydroviewer - - Args: - recs: the response from forecast_records - stats: the response from forecast_stats - ensem: the response from forecast_ensembles - rperiods: (optional) the response from return_periods - outformat: (optional) either 'plotly' or 'plotly_html' (default plotly) - record_days: (optional) number of days of forecast records to show before the start of the forecast - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - hide_ensem: Hide the ensemble members in the layout initially - hide_rperiods: Hide the return period labels in the layout initially - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. 
pick plotly or plotly_html') - - # determine the bounds of the plot on the x and y axis - stats_dates = stats.index.tolist() - # limit the forecast records to 7 days before the start of the forecast - recs = recs[recs.index >= pd.to_datetime(stats_dates[0] - datetime.timedelta(days=record_days))] - records_dates = recs.index.tolist() - if len(records_dates) == 0: - startdate = stats_dates[0] - enddate = stats_dates[-1] - else: - startdate = min(records_dates[0], stats_dates[0]) - enddate = max(records_dates[-1], stats_dates[-1]) - max_flow = max(recs['streamflow_m^3/s'].max(), stats['flow_max_m^3/s'].max()) - - # start building the plotly graph object - figure = forecast_records(recs, outformat='plotly') - for new_scatter in forecast_stats(stats, outformat='plotly_scatters'): - figure.add_trace(new_scatter) - - # do the ensembles separately so we can group then and make only 1 legend entry - ensemble_data = forecast_ensembles(ensem, outformat='json') - figure.add_trace(go.Scatter( - x=ensemble_data['x_1-51'], - y=ensemble_data['ensemble_01_m^3/s'], - visible='legendonly' if hide_ensem else True, - legendgroup='ensembles', - name='Forecast Ensembles', - )) - for i in range(2, 52): - figure.add_trace(go.Scatter( - x=ensemble_data['x_1-51'], - y=ensemble_data[f'ensemble_{i:02}_m^3/s'], - visible='legendonly' if hide_ensem else True, - legendgroup='ensembles', - name=f'Ensemble {i}', - showlegend=False, - )) - if rperiods is not None: - max_visible = max(stats['flow_75%_m^3/s'].max(), stats['flow_avg_m^3/s'].max(), stats['high_res_m^3/s'].max(), - recs['streamflow_m^3/s'].max()) - for rp in _rperiod_scatters(startdate, enddate, rperiods, max_flow, max_visible, - visible='legendonly' if hide_rperiods else True): - figure.add_trace(rp) - - figure.update_layout( - title=_build_title('Forecasted Streamflow', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate]}, - ) - - if outformat == 'plotly': - return figure - else: # outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - - -def forecast_stats(stats: pd.DataFrame, rperiods: pd.DataFrame = None, titles: dict = False, - outformat: str = 'plotly', hide_maxmin: bool = False) -> go.Figure: - """ - Makes the streamflow data and optional metadata into a plotly plot - - Args: - stats: the csv response from forecast_stats - rperiods: the csv response from return_periods - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: 'json', 'plotly', 'plotly_scatters', or 'plotly_html' (default plotly) - hide_maxmin: Choose to hide the max/min envelope by default - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['json', 'plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. 
pick json, plotly, plotly_scatters, or plotly_html') - - # Start processing the inputs - dates = stats.index.tolist() - startdate = dates[0] - enddate = dates[-1] - - plot_data = { - 'x_stats': stats['flow_avg_m^3/s'].dropna(axis=0).index.tolist(), - 'x_hires': stats['high_res_m^3/s'].dropna(axis=0).index.tolist(), - 'y_max': max(stats['flow_max_m^3/s']), - 'flow_max': list(stats['flow_max_m^3/s'].dropna(axis=0)), - 'flow_75%': list(stats['flow_75%_m^3/s'].dropna(axis=0)), - 'flow_avg': list(stats['flow_avg_m^3/s'].dropna(axis=0)), - 'flow_25%': list(stats['flow_25%_m^3/s'].dropna(axis=0)), - 'flow_min': list(stats['flow_min_m^3/s'].dropna(axis=0)), - 'high_res': list(stats['high_res_m^3/s'].dropna(axis=0)), - } - if rperiods is not None: - plot_data.update(rperiods.to_dict(orient='index').items()) - max_visible = max(max(plot_data['flow_75%']), max(plot_data['flow_avg']), max(plot_data['high_res'])) - rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max'], max_visible) - else: - rperiod_scatters = [] - if outformat == 'json': - return plot_data - - maxmin_visible = 'legendonly' if hide_maxmin else True - scatter_plots = [ - # Plot together so you can use fill='toself' for the shaded box, also separately so the labels appear - go.Scatter(name='Maximum & Minimum Flow', - x=plot_data['x_stats'] + plot_data['x_stats'][::-1], - y=plot_data['flow_max'] + plot_data['flow_min'][::-1], - legendgroup='boundaries', - fill='toself', - visible=maxmin_visible, - line=dict(color='lightblue', dash='dash')), - go.Scatter(name='Maximum', - x=plot_data['x_stats'], - y=plot_data['flow_max'], - legendgroup='boundaries', - visible=maxmin_visible, - showlegend=False, - line=dict(color='darkblue', dash='dash')), - go.Scatter(name='Minimum', - x=plot_data['x_stats'], - y=plot_data['flow_min'], - legendgroup='boundaries', - visible=maxmin_visible, - showlegend=False, - line=dict(color='darkblue', dash='dash')), - - go.Scatter(name='25-75 Percentile Flow', - x=plot_data['x_stats'] + plot_data['x_stats'][::-1], - y=plot_data['flow_75%'] + plot_data['flow_25%'][::-1], - legendgroup='percentile_flow', - fill='toself', - line=dict(color='lightgreen'), ), - go.Scatter(name='75%', - x=plot_data['x_stats'], - y=plot_data['flow_75%'], - showlegend=False, - legendgroup='percentile_flow', - line=dict(color='green'), ), - go.Scatter(name='25%', - x=plot_data['x_stats'], - y=plot_data['flow_25%'], - showlegend=False, - legendgroup='percentile_flow', - line=dict(color='green'), ), - - go.Scatter(name='High Resolution Forecast', - x=plot_data['x_hires'], - y=plot_data['high_res'], - line={'color': 'black'}, ), - go.Scatter(name='Ensemble Average Flow', - x=plot_data['x_stats'], - y=plot_data['flow_avg'], - line=dict(color='blue'), ), - ] - scatter_plots += rperiod_scatters - - if outformat == 'plotly_scatters': - return scatter_plots - - layout = go.Layout( - title=_build_title('Forecasted Streamflow', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate], 'hoverformat': '%b %d %Y', - 'tickformat': '%b %d %Y'}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - return - - -def forecast_ensembles(ensem: pd.DataFrame, rperiods: pd.DataFrame = None, titles: dict = False, - outformat: str = 'plotly') -> 
go.Figure: - """ - Makes the streamflow ensemble data and metadata into a plotly plot - - Args: - ensem: the csv response from forecast_ensembles - rperiods: the csv response from return_periods - outformat: either 'json', 'plotly', or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['json', 'plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. pick json, plotly, plotly_scatters, or plotly_html') - - # variables to determine the maximum flow and hold all the scatter plot lines - max_flows = [] - scatter_plots = [] - - # determine the threshold values for return periods and the start/end dates so we can plot them - dates = ensem.index.tolist() - startdate = dates[0] - enddate = dates[-1] - - # process the series' components and store them in a dictionary - plot_data = { - 'x_1-51': ensem['ensemble_01_m^3/s'].dropna(axis=0).index.tolist(), - 'x_52': ensem['ensemble_52_m^3/s'].dropna(axis=0).index.tolist(), - } - - # add a dictionary entry for each of the ensemble members. the key for each series is the integer ensemble number - for ensemble in ensem.columns: - plot_data[ensemble] = ensem[ensemble].dropna(axis=0).tolist() - max_flows.append(max(plot_data[ensemble])) - plot_data['y_max'] = max(max_flows) - - if rperiods is not None: - plot_data.update(rperiods.to_dict(orient='index').items()) - rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max']) - else: - rperiod_scatters = [] - if outformat == 'json': - return plot_data - - # create the high resolution line (ensemble 52) - scatter_plots.append(go.Scatter( - name='High Resolution Forecast', - x=plot_data['x_52'], - y=plot_data['ensemble_52_m^3/s'], - line=dict(color='black') - )) - # create a line for the rest of the ensembles (1-51) - for i in range(1, 52): - scatter_plots.append(go.Scatter( - name='Ensemble ' + str(i), - x=plot_data['x_1-51'], - y=plot_data[f'ensemble_{i:02}_m^3/s'], - )) - scatter_plots += rperiod_scatters - - if outformat == 'plotly_scatters': - return scatter_plots - - # define a layout for the plot - layout = go.Layout( - title=_build_title('Ensemble Predicted Streamflow', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate], 'hoverformat': '%b %d %Y', - 'tickformat': '%b %d %Y'}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - return - - -def forecast_records(recs: pd.DataFrame, rperiods: pd.DataFrame = None, titles: dict = False, - outformat: str = 'plotly') -> go.Figure: - """ - Makes the streamflow saved forecast data and metadata into a plotly plot - - Args: - recs: the csv response from forecast_records - rperiods: the csv response from return_periods - outformat: either 'json', 'plotly', or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. 
For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['json', 'plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. pick json, plotly, plotly_scatters, or plotly_html') - - # Start processing the inputs - dates = recs.index.tolist() - startdate = dates[0] - enddate = dates[-1] - - plot_data = { - 'x_records': dates, - 'recorded_flows': recs.dropna(axis=0).values.flatten(), - 'y_max': max(recs.values), - } - if rperiods is not None: - plot_data.update(rperiods.to_dict(orient='index').items()) - rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max'], plot_data['y_max']) - else: - rperiod_scatters = [] - if outformat == 'json': - return plot_data - - scatter_plots = [go.Scatter( - name='Previous Forecast Average', - x=plot_data['x_records'], - y=plot_data['recorded_flows'], - line=dict(color='gold'), - )] + rperiod_scatters - - if outformat == 'plotly_scatters': - return scatter_plots - - layout = go.Layout( - title=_build_title('Forecasted Streamflow Record', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate]}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - return - - -def historic_simulation(hist: pd.DataFrame, rperiods: pd.DataFrame = None, titles: dict = False, - outformat: str = 'plotly') -> go.Figure: - """ - Makes the streamflow ensemble data and metadata into a plotly plot - - Args: - hist: the csv response from historic_simulation - rperiods: the csv response from return_periods - outformat: either 'json', 'plotly', or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['json', 'plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. 
pick json, plotly, plotly_scatters, or plotly_html') - - dates = hist.index.tolist() - startdate = dates[0] - enddate = dates[-1] - - plot_data = { - 'x_datetime': dates, - 'y_flow': hist.values.flatten(), - 'y_max': max(hist.values), - } - if rperiods is not None: - plot_data.update(rperiods.to_dict(orient='index').items()) - rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max'], plot_data['y_max']) - else: - rperiod_scatters = [] - - if outformat == 'json': - return plot_data - - scatter_plots = [go.Scatter( - name='Historical Simulation', - x=plot_data['x_datetime'], - y=plot_data['y_flow']) - ] - scatter_plots += rperiod_scatters - - if outformat == 'plotly_scatters': - return scatter_plots - - layout = go.Layout( - title=_build_title('Historical Streamflow Simulation', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate], 'hoverformat': '%b %d %Y', - 'tickformat': '%Y'}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose json, plotly, plotly_scatters, or plotly_html') - - -def daily_averages(dayavg: pd.DataFrame, titles: dict = False, outformat: str = 'plotly') -> go.Figure: - """ - Makes the daily_averages data and metadata into a plotly plot - - Args: - dayavg: the csv response from daily_averages - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: either 'plotly', or 'plotly_html' (default plotly) - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. pick plotly, plotly_scatters, or plotly_html') - - scatter_plots = [ - go.Scatter( - name='Average Daily Flow', - x=dayavg.index.tolist(), - y=dayavg.values.flatten(), - line=dict(color='blue') - ), - ] - if outformat == 'plotly_scatters': - return scatter_plots - layout = go.Layout( - title=_build_title('Daily Average Streamflow (Simulated)', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly, plotly_scatters, or plotly_html') - - -def daily_variance(daily_variance: pd.DataFrame, titles: dict = None, outformat: str = 'plotly') -> go.Figure: - """ - A dataframe of daily variances computed by the geoglows.analysis.compute_daily_variance function - - Args: - daily_variance: dataframe of values to plot coming from geoglows.analysis.compute_daily_variance - titles: (dict) Extra info to show on the title of the plot. 
For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: either 'plotly' (python object, default), 'plotly_scatters', or 'plotly_html' - - returns: - plot of standard deviation and plot of the graph of the low flows - """ - data = [ - go.Scatter( - x=daily_variance['date'], - y=daily_variance['flow_std'], - name="Daily Standard Deviation" - ) - ] - if outformat == 'plotly_scatters': - return data - figure = go.Figure(data=data, layout=go.Layout(_build_title('Daily Flow Standard Deviation', titles))) - if outformat == 'plotly': - return figure - elif outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html') - - -def daily_stats(hist: pd.DataFrame, titles: dict = None, outformat: str = 'plotly') -> go.Figure: - """ - Plots a graph with statistics for each day of year - - Args: - hist: dataframe of values to plot - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: either 'plotly' (python object, default), 'plotly_scatters', or 'plotly_html' - - returns: - plot of the graph of the low flows - """ - - stats_df = compute_daily_statistics(hist) - - data = [ - go.Scatter( - x=stats_df.index.tolist(), - y=stats_df[column].tolist(), - name=column - ) for column in stats_df.columns - ] - - if outformat == 'plotly_scatters': - return data - layout = go.Layout( - title=_build_title('Daily Average Streamflow (Simulated)', titles), - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, - ) - figure = go.Figure(data=data, layout=layout) - if outformat == 'plotly': - return figure - elif outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html') - - -def monthly_averages(monavg: pd.DataFrame, titles: dict = False, outformat: str = 'plotly') -> go.Figure: - """ - Makes the daily_averages data and metadata into a plotly plot - - Args: - monavg: the csv response from monthly_averages - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: either 'plotly', or 'plotly_html' (default plotly) - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. 
pick plotly, plotly_scatters, or plotly_html') - - scatter_plots = [ - go.Scatter( - name='Average Monthly Flow', - x=pd.to_datetime(monavg.index, format='%m').strftime('%B'), - y=monavg.values.flatten(), - line=dict(color='blue') - ), - ] - if outformat == 'plotly_scatters': - return scatter_plots - layout = go.Layout( - title=_build_title('Monthly Average Streamflow (Simulated)', titles), - yaxis={'title': 'Streamflow (m3/s)'}, - xaxis={'title': 'Month'}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly, plotly_scatters, or plotly_html') - - -def flow_duration_curve(hist: pd.DataFrame, titles: dict = False, outformat: str = 'plotly') -> go.Figure: - """ - Makes the streamflow ensemble data and metadata into a plotly plot - - Args: - hist: the csv response from historic_simulation - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - outformat: either 'json', 'plotly', or 'plotly_html' (default plotly) - - Return: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if outformat not in ['json', 'plotly_scatters', 'plotly', 'plotly_html']: - raise ValueError('invalid outformat specified. pick json, plotly, plotly_scatters, or plotly_html') - - # process the hist dataframe to create the flow duration curve - sorted_hist = hist.values.flatten() - sorted_hist.sort() - - # ranks data from smallest to largest - ranks = len(sorted_hist) - scipy.stats.rankdata(sorted_hist, method='average') - - # calculate probability of each rank - prob = [(ranks[i] / (len(sorted_hist) + 1)) for i in range(len(sorted_hist))] - - plot_data = { - 'x_probability': prob, - 'y_flow': sorted_hist, - 'y_max': sorted_hist[0], - } - - if outformat == 'json': - return plot_data - - scatter_plots = [ - go.Scatter( - name='Flow Duration Curve', - x=plot_data['x_probability'], - y=plot_data['y_flow']) - ] - if outformat == 'plotly_scatters': - return scatter_plots - layout = go.Layout( - title=_build_title('Flow Duration Curve', titles), - xaxis={'title': 'Exceedence Probability'}, - yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - ) - figure = go.Figure(scatter_plots, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose json, plotly, plotly_scatters, or plotly_html') - - -# STREAMFLOW HTML TABLES -def probabilities_table(stats: pd.DataFrame, ensem: pd.DataFrame, rperiods: pd.DataFrame) -> str: - """ - Processes the results of forecast_stats , forecast_ensembles, and return_periods and uses jinja2 template - rendering to generate html code that shows the probabilities of exceeding the return period flow on each day. 
- - Args: - stats: the csv response from forecast_stats - ensem: the csv response from forecast_ensembles - rperiods: the csv response from return_periods - - Return: - string containing html to build a table with a row of dates and for exceeding each return period threshold - """ - dates = stats.index.tolist() - startdate = dates[0] - enddate = dates[-1] - span = enddate - startdate - uniqueday = [startdate + datetime.timedelta(days=i) for i in range(span.days + 2)] - # get the return periods for the stream reach - rp2 = rperiods['return_period_2'].values - rp5 = rperiods['return_period_5'].values - rp10 = rperiods['return_period_10'].values - rp25 = rperiods['return_period_25'].values - rp50 = rperiods['return_period_50'].values - rp100 = rperiods['return_period_100'].values - # fill the lists of things used as context in rendering the template - days = [] - r2 = [] - r5 = [] - r10 = [] - r25 = [] - r50 = [] - r100 = [] - for i in range(len(uniqueday) - 1): # (-1) omit the extra day used for reference only - tmp = ensem.loc[uniqueday[i]:uniqueday[i + 1]] - days.append(uniqueday[i].strftime('%b %d')) - num2 = 0 - num5 = 0 - num10 = 0 - num25 = 0 - num50 = 0 - num100 = 0 - for column in tmp: - column_max = tmp[column].to_numpy().max() - if column_max > rp100: - num100 += 1 - if column_max > rp50: - num50 += 1 - if column_max > rp25: - num25 += 1 - if column_max > rp10: - num10 += 1 - if column_max > rp5: - num5 += 1 - if column_max > rp2: - num2 += 1 - r2.append(round(num2 * 100 / 52)) - r5.append(round(num5 * 100 / 52)) - r10.append(round(num10 * 100 / 52)) - r25.append(round(num25 * 100 / 52)) - r50.append(round(num50 * 100 / 52)) - r100.append(round(num100 * 100 / 52)) - path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates', 'probabilities_table.html')) - with open(path) as template: - return jinja2.Template(template.read()).render(days=days, r2=r2, r5=r5, r10=r10, r25=r25, r50=r50, r100=r100, - colors=_plot_colors()) - - -def return_periods_table(rperiods: pd.DataFrame) -> str: - """ - Processes the dataframe response from the return_periods function and uses jinja2 to render an html string for a - table showing each of the return periods - - Args: - rperiods: the dataframe from the return periods function - - Returns: - string of html - """ - - # find the correct template to render - path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates', 'return_periods_table.html')) - # work on a copy of the dataframe so you dont ruin the original - tmp = rperiods - reach_id = str(tmp.index[0]) - js = json.loads(tmp.to_json()) - rp = {} - for i in js: - if i.startswith('return_period'): - year = i.split('_')[-1] - rp[f'{year} Year'] = js[i][reach_id] - elif i == 'max_flow': - rp['Max Simulated Flow'] = js[i][reach_id] - - rp = {key: round(value, 2) for key, value in sorted(rp.items(), key=lambda item: item[1])} - - with open(path) as template: - return jinja2.Template(template.read()).render(reach_id=reach_id, rp=rp, colors=_plot_colors()) - - -# BIAS CORRECTION PLOTS -def corrected_historical(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame, - rperiods: pd.DataFrame = None, titles: dict = None, - outformat: str = 'plotly') -> go.Figure or str: - """ - Creates a plot of corrected discharge, observered discharge, and simulated discharge - - Args: - corrected: the response from the geoglows.bias.correct_historical_simulation function\ - simulated: the csv response from historic_simulation - observed: the dataframe of observed data. 
Must have a datetime index and a single column of flow values - rperiods: the csv response from return_periods - outformat: either 'plotly', or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Returns: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - startdate = corrected.index[0] - enddate = corrected.index[-1] - - plot_data = { - 'x_simulated': corrected.index.tolist(), - 'x_observed': observed.index.tolist(), - 'y_corrected': corrected.values.flatten(), - 'y_simulated': simulated.values.flatten(), - 'y_observed': observed.values.flatten(), - 'y_max': max(corrected.values.max(), observed.values.max(), simulated.values.max()), - } - if rperiods is not None: - plot_data.update(rperiods.to_dict(orient='index').items()) - rperiod_scatters = _rperiod_scatters(startdate, enddate, rperiods, plot_data['y_max'], plot_data['y_max']) - else: - rperiod_scatters = [] - - if outformat == 'json': - return plot_data - - scatters = [ - go.Scatter( - name='Simulated Data', - x=plot_data['x_simulated'], - y=plot_data['y_simulated'], - line=dict(color='red') - ), - go.Scatter( - name='Observed Data', - x=plot_data['x_observed'], - y=plot_data['y_observed'], - line=dict(color='blue') - ), - go.Scatter( - name='Corrected Simulated Data', - x=plot_data['x_simulated'], - y=plot_data['y_corrected'], - line=dict(color='#00cc96') - ), - ] - scatters += rperiod_scatters - - layout = go.Layout( - title=_build_title("Historical Simulation Comparison", titles), - yaxis={'title': 'Discharge (m3/s)'}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate], 'hoverformat': '%b %d %Y', - 'tickformat': '%Y'}, - ) - - figure = go.Figure(data=scatters, layout=layout) - if outformat == 'plotly': - return figure - if outformat == 'plotly_html': - return offline_plot( - figure, - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose json, plotly, plotly_scatters, or plotly_html') - - -def corrected_scatterplots(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame, - merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False, - titles: dict = None, outformat: str = 'plotly') -> go.Figure or str: - """ - Creates a plot of corrected discharge, observered discharge, and simulated discharge. This function uses - hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full - comparison of bias correction, you can provide them to save time - - Args: - corrected: the response from the geoglows.bias.correct_historical_simulation function - simulated: the csv response from historic_simulation - observed: the dataframe of observed data. Must have a datetime index and a single column of flow values - merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed) - merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed) - outformat: either 'plotly' or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. 
For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Returns: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if corrected is False and simulated is False and observed is False: - if merged_sim_obs is not False and merged_cor_obs is not False: - pass # if you provided the merged dataframes already, we use those - else: - # merge the datasets together - merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed) - merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed) - - # get the min/max values for plotting the 45 degree line - min_value = min(min(merged_sim_obs.iloc[:, 1].values), min(merged_sim_obs.iloc[:, 0].values)) - max_value = max(max(merged_sim_obs.iloc[:, 1].values), max(merged_sim_obs.iloc[:, 0].values)) - - # do a linear regression on both of the merged dataframes - slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(merged_sim_obs.iloc[:, 0].values, - merged_sim_obs.iloc[:, 1].values) - slope2, intercept2, r_value2, p_value2, std_err2 = scipy.stats.linregress(merged_cor_obs.iloc[:, 0].values, - merged_cor_obs.iloc[:, 1].values) - scatter_sets = [ - go.Scatter( - x=merged_sim_obs.iloc[:, 0].values, - y=merged_sim_obs.iloc[:, 1].values, - mode='markers', - name='Original Data', - marker=dict(color='#ef553b') - ), - go.Scatter( - x=merged_cor_obs.iloc[:, 0].values, - y=merged_cor_obs.iloc[:, 1].values, - mode='markers', - name='Corrected', - marker=dict(color='#00cc96') - ), - go.Scatter( - x=[min_value, max_value], - y=[min_value, max_value], - mode='lines', - name='45 degree line', - line=dict(color='black') - ), - go.Scatter( - x=[min_value, max_value], - y=[slope * min_value + intercept, slope * max_value + intercept], - mode='lines', - name=f'Y = {round(slope, 2)}x + {round(intercept, 2)} (Original)', - line=dict(color='red') - ), - go.Scatter( - x=[min_value, max_value], - y=[slope2 * min_value + intercept2, slope2 * max_value + intercept2], - mode='lines', - name=f'Y = {round(slope2, 2)}x + {round(intercept2, 2)} (Corrected)', - line=dict(color='green') - ) - ] - - updatemenus = [ - dict(active=0, - buttons=[dict(label='Linear Scale', - method='update', - args=[{'visible': [True, True]}, - {'title': 'Linear scale', - 'yaxis': {'type': 'linear'}}]), - dict(label='Log Scale', - method='update', - args=[{'visible': [True, True]}, - {'title': 'Log scale', - 'xaxis': {'type': 'log'}, - 'yaxis': {'type': 'log'}}]), - ] - ) - ] - - layout = go.Layout(title=_build_title('Bias Correction Scatter Plot', titles), - xaxis=dict(title='Simulated', ), - yaxis=dict(title='Observed', autorange=True), - showlegend=True, updatemenus=updatemenus) - if outformat == 'plotly': - return go.Figure(data=scatter_sets, layout=layout) - elif outformat == 'plotly_html': - return offline_plot( - go.Figure(data=scatter_sets, layout=layout), - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html') - - -def corrected_month_average(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame, - merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False, - titles: dict = None, outformat: str = 'plotly') -> go.Figure or str: - """ - Calculates and plots the monthly average streamflow. This function uses - hydrostats.data.merge_data on the 3 inputs. 
If you have already computed these because you are doing a full - comparison of bias correction, you can provide them to save time - - Args: - corrected: the response from the geoglows.bias.correct_historical_simulation function - simulated: the csv response from historic_simulation - observed: the dataframe of observed data. Must have a datetime index and a single column of flow values - merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed) - merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed) - outformat: either 'plotly' or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Returns: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if corrected is False and simulated is False and observed is False: - if merged_sim_obs is not False and merged_cor_obs is not False: - pass # if you provided the merged dataframes already, we use those - else: - # merge the datasets together - merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed) - merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed) - monthly_avg = hd.monthly_average(merged_sim_obs) - monthly_avg2 = hd.monthly_average(merged_cor_obs) - - scatters = [ - go.Scatter(x=monthly_avg.index, y=monthly_avg.iloc[:, 1].values, name='Observed Data'), - go.Scatter(x=monthly_avg.index, y=monthly_avg.iloc[:, 0].values, name='Simulated Data'), - go.Scatter(x=monthly_avg2.index, y=monthly_avg2.iloc[:, 0].values, name='Corrected Simulated Data'), - ] - - layout = go.Layout( - title=_build_title('Monthly Average Streamflow Comparison', titles), - xaxis=dict(title='Month'), yaxis=dict(title='Discharge (m3/s)', autorange=True), - showlegend=True) - - if outformat == 'plotly': - return go.Figure(data=scatters, layout=layout) - elif outformat == 'plotly_html': - return offline_plot( - go.Figure(data=scatters, layout=layout), - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html') - - -def corrected_day_average(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame, - merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False, - titles: dict = None, outformat: str = 'plotly') -> go.Figure or str: - """ - Calculates and plots the daily average streamflow. This function uses - hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full - comparison of bias correction, you can provide them to save time - - Args: - corrected: the response from the geoglows.bias.correct_historical_simulation function - simulated: the csv response from historic_simulation - merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed) - merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed) - observed: the dataframe of observed data. Must have a datetime index and a single column of flow values - outformat: either 'plotly' or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. 
For example: - {'Reach ID': 1234567, 'Drainage Area': '1000km^2'} - - Returns: - plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method - """ - if corrected is False and simulated is False and observed is False: - if merged_sim_obs is not False and merged_cor_obs is not False: - pass # if you provided the merged dataframes already, we use those - else: - # merge the datasets together - merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed) - merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed) - daily_avg = hd.daily_average(merged_sim_obs) - daily_avg2 = hd.daily_average(merged_cor_obs) - - scatters = [ - go.Scatter(x=daily_avg.index, y=daily_avg.iloc[:, 1].values, name='Observed Data'), - go.Scatter(x=daily_avg.index, y=daily_avg.iloc[:, 0].values, name='Simulated Data'), - go.Scatter(x=daily_avg2.index, y=daily_avg2.iloc[:, 0].values, name='Corrected Simulated Data'), - ] - - layout = go.Layout( - title=_build_title('Daily Average Streamflow Comparison', titles), - xaxis=dict(title='Days'), yaxis=dict(title='Discharge (m3/s)', autorange=True), - showlegend=True) - - if outformat == 'plotly': - return go.Figure(data=scatters, layout=layout) - elif outformat == 'plotly_html': - return offline_plot( - go.Figure(data=scatters, layout=layout), - config={'autosizable': True, 'responsive': True}, - output_type='div', - include_plotlyjs=False - ) - raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html') - - -def corrected_volume_compare(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame, - merged_sim_obs: pd.DataFrame = False, merged_cor_obs: pd.DataFrame = False, - titles: dict = None, outformat: str = 'plotly') -> go.Figure or str: - """ - Calculates and plots the cumulative volume output on each of the 3 datasets provided. This function uses - hydrostats.data.merge_data on the 3 inputs. If you have already computed these because you are doing a full - comparison of bias correction, you can provide them to save time - - Args: - corrected: the response from the geoglows.bias.correct_historical_simulation function - simulated: the csv response from historic_simulation - observed: the dataframe of observed data. Must have a datetime index and a single column of flow values - merged_sim_obs: (optional) if you have already computed it, hydrostats.data.merge_data(simulated, observed) - merged_cor_obs: (optional) if you have already computed it, hydrostats.data.merge_data(corrected, observed) - outformat: either 'plotly' or 'plotly_html' (default plotly) - titles: (dict) Extra info to show on the title of the plot. 
For example:
-            {'Reach ID': 1234567, 'Drainage Area': '1000km^2'}
-
-    Returns:
-        plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
-    """
-    if corrected is False and simulated is False and observed is False:
-        if merged_sim_obs is not False and merged_cor_obs is not False:
-            pass  # if you provided the merged dataframes already, we use those
-    else:
-        # merge the datasets together
-        merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed)
-        merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed)
-
-    sim_array = merged_sim_obs.iloc[:, 0].values
-    obs_array = merged_sim_obs.iloc[:, 1].values
-    corr_array = merged_cor_obs.iloc[:, 0].values
-
-    sim_volume_dt = sim_array * 0.0864
-    obs_volume_dt = obs_array * 0.0864
-    corr_volume_dt = corr_array * 0.0864
-
-    sim_volume_cum = []
-    obs_volume_cum = []
-    corr_volume_cum = []
-    sum_sim = 0
-    sum_obs = 0
-    sum_corr = 0
-
-    for i in sim_volume_dt:
-        sum_sim = sum_sim + i
-        sim_volume_cum.append(sum_sim)
-
-    for j in obs_volume_dt:
-        sum_obs = sum_obs + j
-        obs_volume_cum.append(sum_obs)
-
-    for k in corr_volume_dt:
-        sum_corr = sum_corr + k
-        corr_volume_cum.append(sum_corr)
-
-    observed_volume = go.Scatter(x=merged_sim_obs.index, y=obs_volume_cum, name='Observed', )
-    simulated_volume = go.Scatter(x=merged_sim_obs.index, y=sim_volume_cum, name='Simulated', )
-    corrected_volume = go.Scatter(x=merged_cor_obs.index, y=corr_volume_cum, name='Corrected Simulated', )
-
-    layout = go.Layout(
-        title=_build_title('Cumulative Volume Comparison', titles),
-        xaxis=dict(title='Datetime', ), yaxis=dict(title='Volume (m3)', autorange=True),
-        showlegend=True)
-
-    if outformat == 'plotly':
-        return go.Figure(data=[observed_volume, simulated_volume, corrected_volume], layout=layout)
-    elif outformat == 'plotly_html':
-        return offline_plot(
-            go.Figure(data=[observed_volume, simulated_volume, corrected_volume], layout=layout),
-            config={'autosizable': True, 'responsive': True},
-            output_type='div',
-            include_plotlyjs=False
-        )
-    raise ValueError('Invalid outformat chosen. Choose plotly or plotly_html')
-
-
-# PLOTTING AUXILIARY FUNCTIONS
-def _build_title(base, title_headers):
    if not title_headers:
-        return base
-    if 'bias_corrected' in title_headers.keys():
-        base = 'Bias Corrected ' + base
-    for head in title_headers:
-        if head == 'bias_corrected':
-            continue
-        base += f'<br>{head}: {title_headers[head]}'
-    return base
-
-
-def _plot_colors():
-    return {
-        '2 Year': 'rgba(254, 240, 1, .4)',
-        '5 Year': 'rgba(253, 154, 1, .4)',
-        '10 Year': 'rgba(255, 56, 5, .4)',
-        '20 Year': 'rgba(128, 0, 246, .4)',
-        '25 Year': 'rgba(255, 0, 0, .4)',
-        '50 Year': 'rgba(128, 0, 106, .4)',
-        '100 Year': 'rgba(128, 0, 246, .4)',
-    }
-
-
-def _rperiod_scatters(startdate: str, enddate: str, rperiods: pd.DataFrame, y_max: float, max_visible: float = 0,
-                      visible: bool = None):
-    colors = _plot_colors()
-    x_vals = (startdate, enddate, enddate, startdate)
-    r2 = int(rperiods['return_period_2'].values[0])
-    if visible is None:
-        if max_visible > r2:
-            visible = True
-        else:
-            visible = 'legendonly'
-
-    def template(name, y, color, fill='toself'):
-        return go.Scatter(
-            name=name,
-            x=x_vals,
-            y=y,
-            legendgroup='returnperiods',
-            fill=fill,
-            visible=visible,
-            line=dict(color=color, width=0))
-
-    if list(rperiods.columns) == ['max_flow', 'return_period_20', 'return_period_10', 'return_period_2']:
-        r10 = int(rperiods['return_period_10'].values[0])
-        r20 = int(rperiods['return_period_20'].values[0])
-        rmax = int(max(2 * r20 - r10, y_max))
-        return [
-            template(f'2 Year: {r2}', (r2, r2, r10, r10), colors['2 Year']),
-            template(f'10 Year: {r10}', (r10, r10, r20, r20), colors['10 Year']),
-            template(f'20 Year: {r20}', (r20, r20, rmax, rmax), colors['20 Year']),
-        ]
-
-    else:
-        r5 = int(rperiods['return_period_5'].values[0])
-        r10 = int(rperiods['return_period_10'].values[0])
-        r25 = int(rperiods['return_period_25'].values[0])
-        r50 = int(rperiods['return_period_50'].values[0])
-        r100 = int(rperiods['return_period_100'].values[0])
-        rmax = int(max(2 * r100 - r25, y_max))
-        return [
-            template('Return Periods', (rmax, rmax, rmax, rmax), 'rgba(0,0,0,0)', fill='none'),
-            template(f'2 Year: {r2}', (r2, r2, r5, r5), colors['2 Year']),
-            template(f'5 Year: {r5}', (r5, r5, r10, r10), colors['5 Year']),
-            template(f'10 Year: {r10}', (r10, r10, r25, r25), colors['10 Year']),
-            template(f'25 Year: {r25}', (r25, r25, r50, r50), colors['25 Year']),
-            template(f'50 Year: {r50}', (r50, r50, r100, r100), colors['50 Year']),
-            template(f'100 Year: {r100}', (r100, r100, rmax, rmax), colors['100 Year']),
-        ]
-
-
-def plot_low_flows(data: pd.DataFrame, rps: tuple = (2, 5, 10, 25, 50)):
-    """
-    Plots the daily average flow and the low-flow quantile for each return period, by day of the year
-
-    Args:
-        data: dataframe of values to plot
-        rps: tuple for the return periods - default is (2, 5, 10, 25, 50)
-
-    Returns:
-        plotly Figure of the daily average and low-flow traces
-    """
-    data['datetime'] = pd.to_datetime(data['datetime'])
-    streamflow_data = pd.DataFrame()
-    streamflow_data['streamflow'] = data['streamflow_m^3/s']
-    streamflow_data['day'] = data['datetime'].dt.strftime('%m-%d')
-    daily_averages = streamflow_data.groupby('day').mean()
-    averages = [
-        go.Scatter(
-            x=daily_averages.index,
-            y=daily_averages['streamflow'],
-            name="daily-averages"
-        )
-    ]
-    layout = go.Layout(xaxis={'type': 'category'}, title="Low flows by day of year")
-    plot = go.Figure(data=averages, layout=layout)
-    for rp in rps:
-        quantile = streamflow_data.groupby('day').quantile(1 / rp)
-        plot.add_trace(
-            go.Scatter(
-                x=daily_averages.index,
-                y=quantile['streamflow'],
-                name=f'{rp}_year'
-            )
-        )
-    return plot
diff --git a/geoglows/streamflow.py b/geoglows/streamflow.py
deleted file mode 100644
index 9dcca55..0000000
--- a/geoglows/streamflow.py
+++ /dev/null
@@ -1,567 +0,0 @@
-import json
-import os
-import pickle
-from collections import
OrderedDict -from io import StringIO - -import pandas as pd -import requests -from shapely.geometry import Point, MultiPoint, shape -from shapely.ops import nearest_points - -ENDPOINT = 'https://geoglows.ecmwf.int/api/' - - -__all__ = ['forecast_stats', 'forecast_ensembles', 'forecast_warnings', 'forecast_records', 'historic_simulation', - 'daily_averages', 'monthly_averages', 'return_periods', 'available_data', 'available_dates', - 'available_regions', 'reach_to_region', 'reach_to_latlon', 'latlon_to_reach', 'latlon_to_region', ] - - -# FUNCTIONS THAT CALL THE GLOBAL STREAMFLOW PREDICTION API -def forecast_stats(reach_id: int, return_format: str = 'csv', forecast_date: str = None, - endpoint: str = ENDPOINT, s: requests.Session = False) -> pd.DataFrame: - """ - Retrieves statistics that summarize the ensemble streamflow forecast for a certain reach_id - - Args: - reach_id: the ID of a stream - return_format: 'csv', 'json', 'waterml', 'url' - forecast_date: a string specifying the date to request in YYYYMMDD format - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='json' returns a json - - return_format='waterml' returns a waterml string - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.forecast_stats(12341234) - """ - method = 'ForecastStats/' - - # if you only wanted the url, quit here - if return_format == 'url': - return f'{endpoint}{method}?reach_id={reach_id}' - params = {'reach_id': reach_id, 'return_format': return_format} - if forecast_date is not None: - params["date"] = forecast_date - # return the requested data - return _make_request(endpoint, method, params, return_format, s) - - -def forecast_ensembles(reach_id: int, return_format: str = 'csv', forecast_date: str = None, - endpoint: str = ENDPOINT, s: requests.Session = False) -> pd.DataFrame: - """ - Retrieves each ensemble from the most recent streamflow forecast for a certain reach_id - - Args: - reach_id: the ID of a stream - return_format: 'csv', 'json', 'waterml', 'url' - forecast_date: a string specifying the date to request in YYYYMMDD format - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='json' returns a json - - return_format='waterml' returns a waterml string - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.forecast_ensembles(12341234) - """ - method = 'ForecastEnsembles/' - - # if you only wanted the url, quit here - if return_format == 'url': - return f'{endpoint}{method}?reach_id={reach_id}' - - params = {'reach_id': reach_id, 'return_format': return_format} - if forecast_date is not None: - params["date"] = forecast_date - - # return the requested data - return _make_request(endpoint, method, params, return_format, s) - - -def forecast_warnings(region: str = 'all', return_format='csv', - endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame: - """ - Retrieves a csv listing streams likely to experience a return period level flow during the forecast period. 
- - Args: - region: the name of a region as shown in the available_regions request - return_format: 'csv', 'json', 'waterml', 'request', 'url' - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.forecast_warnings('australia-geoglows') - """ - method = 'ForecastWarnings/' - - # if you only wanted the url, quit here - if return_format == 'url': - return endpoint + method + f'?region={region}' - - # return the requested data - return _make_request(endpoint, method, {'region': region, 'return_format': return_format}, return_format, s) - - -def forecast_records(reach_id: int, start_date: str = None, end_date: str = None, return_format='csv', - endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame: - """ - Retrieves a csv showing the ensemble average forecasted flow for the year from January 1 to the current date - - Args: - reach_id: the ID of a stream - return_format: 'csv', 'json', 'waterml', 'url' - start_date: a string specifying the earliest date to request in YYYYMMDD format - end_date: a string specifying the latest date to request in YYYYMMDD format - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='json' returns a json - - return_format='waterml' returns a waterml string - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.forecast_warnings('australia-geoglows') - """ - method = 'ForecastRecords/' - - # if you only wanted the url, quit here - if return_format == 'url': - return f'{endpoint}{method}?reach_id={reach_id}' - - params = {'reach_id': reach_id, 'return_format': return_format} - if start_date is not None: - params["start_date"] = start_date - if end_date is not None: - params["end_date"] = end_date - - # return the requested data - return _make_request(endpoint, method, params, return_format, s) - - -def historic_simulation(reach_id: int, return_format='csv', forcing='era_5', - endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame: - """ - Retrieves a historical streamflow simulation derived from a specified forcing for a certain reach_id - - Args: - reach_id: the ID of a stream - return_format: 'csv', 'json', 'waterml', 'url' - forcing: the runoff dataset used to drive the historic simulation (era_interim or era_5) - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='json' returns a json - - return_format='waterml' returns a waterml string - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. 
code-block:: python
-
-            data = geoglows.streamflow.historic_simulation(12341234)
-    """
-    method = 'HistoricSimulation/'
-
-    # if you only wanted the url, quit here
-    if return_format == 'url':
-        return f'{endpoint}{method}?reach_id={reach_id}&forcing={forcing}'
-
-    # return the requested data
-    params = {'reach_id': reach_id, 'forcing': forcing, 'return_format': return_format}
-    return _make_request(endpoint, method, params, return_format, s)
-
-
-def daily_averages(reach_id: int, return_format='csv', forcing='era_5',
-                   endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame:
-    """
-    Retrieves the average flow for every day of the year at a certain reach_id.
-
-    Args:
-        reach_id: the ID of a stream
-        return_format: 'csv', 'json', 'waterml', 'url'
-        forcing: the runoff dataset used to drive the historic simulation (era_interim or era_5)
-        endpoint: the endpoint of an api instance
-        s: requests.Session instance connected to the api's root url
-
-    Return Format:
-        - return_format='csv' returns a pd.DataFrame()
-        - return_format='json' returns a json
-        - return_format='waterml' returns a waterml string
-        - return_format='url' returns a url string for using in a request or web browser
-
-    Example:
-        .. code-block:: python
-
-            data = geoglows.streamflow.daily_averages(12341234)
-    """
-    method = 'DailyAverages/'
-
-    # if you only wanted the url, quit here
-    if return_format == 'url':
-        return f'{endpoint}{method}?reach_id={reach_id}&forcing={forcing}'
-
-    # return the requested data
-    params = {'reach_id': reach_id, 'forcing': forcing, 'return_format': return_format}
-    return _make_request(endpoint, method, params, return_format, s)
-
-
-def monthly_averages(reach_id: int, return_format='csv', forcing='era_5',
-                     endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame:
-    """
-    Retrieves the average flow for each month at a certain reach_id.
-
-    Args:
-        reach_id: the ID of a stream
-        forcing: the runoff dataset used to drive the historic simulation (era_interim or era_5)
-        return_format: 'csv', 'json', 'waterml', 'url'
-        endpoint: the endpoint of an api instance
-        s: requests.Session instance connected to the api's root url
-
-    Return Format:
-        - return_format='csv' returns a pd.DataFrame()
-        - return_format='json' returns a json
-        - return_format='waterml' returns a waterml string
-        - return_format='url' returns a url string for using in a request or web browser
-
-    Example:
-        .. code-block:: python
-
-            data = geoglows.streamflow.monthly_averages(12341234)
-    """
-    method = 'MonthlyAverages/'
-
-    # if you only wanted the url, quit here
-    if return_format == 'url':
-        return f'{endpoint}{method}?reach_id={reach_id}&forcing={forcing}'
-
-    # return the requested data
-    params = {'reach_id': reach_id, 'forcing': forcing, 'return_format': return_format}
-    return _make_request(endpoint, method, params, return_format, s)
-
-
-def return_periods(reach_id: int, return_format='csv', forcing='era_5',
-                   endpoint=ENDPOINT, s: requests.Session = False) -> pd.DataFrame:
-    """
-    Retrieves the return period thresholds based on a specified historic simulation forcing on a certain reach_id.
- - Args: - reach_id: the ID of a stream - forcing: the runoff dataset used to drive the historic simulation (era_interim or era_5) - return_format: 'csv', 'json', 'waterml', 'url' - endpoint: the endpoint of an api instance - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='csv' returns a pd.DataFrame() - - return_format='json' returns a json - - return_format='waterml' returns a waterml string - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.return_periods(12341234) - """ - method = 'ReturnPeriods/' - - # if you only wanted the url, quit here - if return_format == 'url': - return f'{endpoint}{method}?reach_id={reach_id}&forcing={forcing}' - - # return the requested data - params = {'reach_id': reach_id, 'forcing': forcing, 'return_format': return_format} - - return _make_request(endpoint, method, params, return_format, s) - - -def available_data(endpoint: str = ENDPOINT, return_format='json', s: requests.Session = False) -> dict or str: - """ - Returns a dictionary with a key for each available_regions containing the available_dates for that region - - Args: - endpoint: the endpoint of an api instance - return_format: 'json' or 'url' - s: requests.Session instance connected to the api's root url - - Returns: - dict - - Example: - .. code-block:: python - - data = geoglows.streamflow.available_data() - - """ - method = 'AvailableData/' - - # if you only wanted the url, quit here - if return_format == 'url': - return endpoint + method - - # return the requested data - return _make_request(endpoint, method, {}, return_format, s) - - -def available_regions(endpoint: str = ENDPOINT, return_format='json', s: requests.Session = False) -> dict or str: - """ - Retrieves a list of regions available at the endpoint - - Args: - endpoint: the endpoint of an api instance - return_format: 'json' or 'url' - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='json' *(default)* returns {'available_regions': ['list_of_dates']} - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. code-block:: python - - data = geoglows.streamflow.available_regions(12341234) - """ - method = 'AvailableRegions/' - - if return_format == 'url': - return endpoint + method - - # return the requested data - return _make_request(endpoint, method, {}, return_format, s) - - -def available_dates(reach_id: int = None, region: str = None, return_format: str = 'json', - endpoint: str = ENDPOINT, s: requests.Session = False) -> dict or str: - """ - Retrieves the list of dates of stored streamflow forecasts. You need to specify either a reach_id or a region. - - Args: - reach_id: the ID of a stream - region: the name of a hydrologic region used in the model - endpoint: the endpoint of an api instance - return_format: 'json' or 'url' - s: requests.Session instance connected to the api's root url - - Return Format: - - return_format='json' *(default)* returns {'available_dates': ['list_of_dates']} - - return_format='url' returns a url string for using in a request or web browser - - Example: - .. 
code-block:: python - - data = geoglows.streamflow.available_dates(12341234) - """ - method = 'AvailableDates/' - - # you need a region for the api call, so the user needs to provide one or a valid reach_id to get it from - if region: - params = {'region': region} - elif reach_id: - params = {'region': reach_to_region(reach_id)} - else: - raise RuntimeError('specify a region or a reach_id') - - # if you only wanted the url, quit here - if return_format == 'url': - return endpoint + method - - # return the requested data - return _make_request(endpoint, method, params, return_format, s) - - -# UTILITY FUNCTIONS -def reach_to_region(reach_id: int) -> str: - """ - returns the delineation region name corresponding to the range of numbers for a given reach_id. - does not validate that the reach_id exists in the region, just associates a number to a name. - - Args: - reach_id: the ID for a stream - - Return: - the name of the delineated world region used by the API. - - Example: - region = geoglows.streamflow.reach_to_region(5000000) - """ - # Indonesia 1M's - # ------australia 2M (currently 200k's) - # Japan 3M's - # East Asia 4M's - # South Asia 5M's - # ------middle_east 6M (currently 600k's) - # Africa 7M's - # Central Asia 8M's - # South America 9M's - # West Asia 10M's - # -------central_america 11M (currently 900k's) - # Europe 12M's - # North America 13M's - - if not isinstance(reach_id, int): - reach_id = int(reach_id) - - lookup = OrderedDict([ - # IMPROPERLY NUMBERED REGIONS - ('australia-geoglows', 300000), - ('middle_east-geoglows', 700000), - ('central_america-geoglows', 1000000), - # CORRECTLY NUMBERED REGIONS - ('islands-geoglows', 2000000), - ('japan-geoglows', 4000000), - ('east_asia-geoglows', 5000000), - ('south_asia-geoglows', 6000000), - ('africa-geoglows', 8000000), - ('central_asia-geoglows', 9000000), - ('south_america-geoglows', 10000000), - ('west_asia-geoglows', 11000000), - ('europe-geoglows', 13000000), - ('north_america-geoglows', 14000000) - ]) - for region, threshold in lookup.items(): - if reach_id < threshold: - return region - raise ValueError(f'{reach_id} not in the range of reach_ids for this model') - - -def reach_to_latlon(reach_id: int, region: str = False) -> tuple: - """ - Finds the latitude and longitude of the centroid of the stream with the specified reach_id. Does not validate that - the reach_id exists, but it will raise an error if the reach_id does not exist - - Args: - reach_id: the ID for a stream - region: if known, you can specify the region of your reach_id - - Returns: - tuple(latitude, longitude) - """ - if not region: - region = reach_to_region(reach_id) - base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'geometry')) - df = pd.read_pickle(os.path.join(base_path, f'{region}-comid_lat_lon_z.pickle')) - df = df[df.index == reach_id] - if len(df.index) == 0: - raise LookupError(f'The reach_id "{reach_id}" was not found in region "{region}"') - return float(df['Lat'].values[0]), float(df['Lon'].values[0]) - - -def latlon_to_reach(lat: float, lon: float) -> dict: - """ - Uses a list of lat/lon for each reach_id to find the closest stream reach to a given lat/lon location - - Args: - lat: a valid latitude - lon: a valid longitude - - Return: - a dictionary containing the reach_id as well as the name of the region and the distance - from the provided lat and lon to the stream in units of degrees. 
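-
-    Example (the coordinates below are illustrative; any valid lat/lon pair works the same way):
-        .. code-block:: python
-
-            # a hypothetical point near the Provo River, Utah; returns the nearest reach_id
-            data = geoglows.streamflow.latlon_to_reach(40.25, -111.65)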
-    """
-    if lat is False or lon is False:
-        raise ValueError('provide a valid latitude and longitude in order to find a reach_id')
-
-    # determine the region that the point is in
-    region = latlon_to_region(lat, lon)
-
-    # switch the point because the csv's are lat/lon, backwards from what shapely expects (lon then lat)
-    point = Point(float(lat), float(lon))
-
-    # open the region csv
-    base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'geometry'))
-    df = pd.read_pickle(os.path.join(base_path, f'{region}-geoglows-comid_lat_lon_z.pickle'))
-    points_df = df.loc[:, "Lat":"Lon"].apply(Point, axis=1)
-
-    # determine which point is closest
-    multi_pt = MultiPoint(points_df.tolist())
-    nearest_pt = nearest_points(point, multi_pt)
-    reach_id = int(points_df[points_df == nearest_pt[1]].index[0])
-    distance = nearest_pt[0].distance(nearest_pt[1])
-
-    # if the nearest stream is more than .1 degrees away, you probably didn't find the right stream
-    if distance > 0.11:
-        return {"error": "Nearest river is more than ~10km away."}
-    else:
-        return dict(reach_id=reach_id, region=region, distance=distance)
-
-
-def latlon_to_region(lat: float, lon: float) -> str:
-    """
-    Uses the bounding boxes of each region to determine which contains the given lat/lon
-
-    Args:
-        lat: a valid latitude
-        lon: a valid longitude
-
-    Return:
-        the name of a region
-    """
-    if lat is False or lon is False:
-        raise ValueError('provide a valid latitude and longitude in order to find a region')
-
-    # open the bounding boxes csv, figure out which regions the point lies within
-    point = Point(float(lon), float(lat))
-    bounds_pickle = os.path.abspath(os.path.join(os.path.dirname(__file__), 'geometry', 'boundaries.pickle'))
-    with open(bounds_pickle, 'rb') as f:
-        region_bounds = json.loads(pickle.load(f))
-    for region in region_bounds:
-        for polygon in region_bounds[region]['features']:
-            if shape(polygon['geometry']).contains(point):
-                return region
-    # if there weren't any regions, return that there was an error
-    raise ValueError('This point is not within any of the supported delineation regions.')
-
-
-# API AUXILIARY FUNCTION
-def _make_request(endpoint: str, method: str, params: dict, return_format: str, s: requests.Session = False):
-    if return_format == 'request':
-        params['return_format'] = 'csv'
-
-    # request the data from the API
-    if s:
-        data = s.get(endpoint + method, params=params)
-    else:
-        data = requests.get(endpoint + method, params=params)
-    if data.status_code != 200:
-        raise RuntimeError('Received an error from the Streamflow REST API: ' + data.text)
-
-    # process the response from the API as appropriate to make the corresponding python object
-    if return_format == 'csv':
-        tmp = pd.read_csv(StringIO(data.text), index_col=0)
-        if 'z' in tmp.columns:
-            del tmp['z']
-        if method in ('ForecastWarnings/', 'ReturnPeriods/', 'DailyAverages/', 'MonthlyAverages/'):
-            return tmp
-        if method == 'SeasonalAverage/':
-            tmp.index = pd.to_datetime(tmp.index + 1, format='%j').strftime('%b %d')
-            return tmp
-        tmp.index = pd.to_datetime(tmp.index)
-        return tmp
-    elif return_format == 'json':
-        return json.loads(data.text)
-    elif return_format == 'waterml':
-        return data.text
-    else:
-        raise ValueError(f'Unsupported return format requested: {return_format}')
diff --git a/geoglows/streams.py b/geoglows/streams.py
new file mode 100644
index 0000000..f83ee71
--- /dev/null
+++ b/geoglows/streams.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+from .data import metadata_tables
+
+__all__ = ['reach_to_vpu', 'latlon_to_reach', 'reach_to_latlon', ]
+
+
+def reach_to_vpu(reach_id: int) -> str or int:
+    return (
+        metadata_tables(columns=['LINKNO', 'VPUCode'])
+        .loc[lambda x: x['LINKNO'] == reach_id, 'VPUCode']
+        .values[0]
+    )
+
+
+def latlon_to_reach(lat: float, lon: float) -> int:
+    df = metadata_tables(columns=['LINKNO', 'lat', 'lon'])
+    # straight-line distance in degrees from the query point to every river; nearest row wins
+    df['dist'] = ((df['lat'] - lat) ** 2 + (df['lon'] - lon) ** 2) ** 0.5
+    return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0]
+
+
+def reach_to_latlon(reach_id: int) -> np.ndarray:
+    return (
+        metadata_tables(columns=['LINKNO', 'lat', 'lon'])
+        .loc[lambda x: x['LINKNO'] == reach_id, ['lat', 'lon']]
+        .values[0]
+    )
diff --git a/geoglows/tables.py b/geoglows/tables.py
new file mode 100644
index 0000000..8803649
--- /dev/null
+++ b/geoglows/tables.py
@@ -0,0 +1,98 @@
+import pandas as pd
+
+__all__ = [
+    'flood_probabilities',
+    'return_periods',
+]
+
+
+def flood_probabilities(ensem: pd.DataFrame, rperiods: pd.DataFrame) -> str:
+    """
+    Processes the results of forecast_ensembles and return_periods and shows the probabilities of exceeding the
+    return period flow on each day.
+
+    Args:
+        ensem: the csv response from forecast_ensembles
+        rperiods: the csv response from return_periods
+
+    Return:
+        string containing html to build a table with a row of dates and the percent of ensembles exceeding each
+        return period threshold
+    """
+    ens = ensem.drop(columns=['ensemble_52']).dropna()
+    ens = ens.groupby(ens.index.date).max()
+    ens.index = pd.to_datetime(ens.index).strftime('%Y-%m-%d')
+    # for each return period, get the percentage of columns with a value greater than the return period on each day
+    percent_series = {rp: (ens > rperiods[rp].values[0]).mean(axis=1).values.tolist() for rp in rperiods}
+    percent_series = pd.DataFrame(percent_series, index=ens.index)
+    percent_series.index.name = 'Date'
+    percent_series.columns = [f'{c} Year' for c in percent_series.columns]
+    percent_series = percent_series * 100
+    percent_series = percent_series.round(1)
+    percent_series = percent_series.reset_index()
+
+    colors = {
+        '2 Year': 'rgba(254, 240, 1, {0})',
+        '5 Year': 'rgba(253, 154, 1, {0})',
+        '10 Year': 'rgba(255, 56, 5, {0})',
+        '20 Year': 'rgba(128, 0, 246, {0})',
+        '25 Year': 'rgba(255, 0, 0, {0})',
+        '50 Year': 'rgba(128, 0, 106, {0})',
+        '100 Year': 'rgba(128, 0, 246, {0})',
+    }
+
+    headers = "".join([f'<th>{x}</th>' for x in percent_series.columns])
+    rows = []
+    for row_idx, row in enumerate(percent_series.values.tolist()):
+        cells = []
+        for col_idx, cell in enumerate(row):
+            if col_idx == 0:
+                cells.append(f'<td>{cell}</td>')
+                continue
+            column_name = percent_series.columns[col_idx]
+            stylestring = f'background-color: {colors[column_name].format(round(cell * 0.005, 2))}'
+            cells.append(f'<td style="{stylestring}">{cell}</td>')
+        rows.append(f'<tr>{"".join(cells)}</tr>')
+    rows = "".join(rows)
+
+    return f'<table><thead><tr>{headers}</tr></thead><tbody>{rows}</tbody></table>'
+
+
+def return_periods(rperiods: pd.DataFrame) -> str:
+    """
+    Makes an html string of a table of return periods and river id numbers
+
+    Args:
+        rperiods: the dataframe from the return periods function
+
+    Returns:
+        string of html
+    """
+    colors = {
+        '2 Year': 'rgba(254, 240, 1, .4)',
+        '5 Year': 'rgba(253, 154, 1, .4)',
+        '10 Year': 'rgba(255, 56, 5, .4)',
+        '20 Year': 'rgba(128, 0, 246, .4)',
+        '25 Year': 'rgba(255, 0, 0, .4)',
+        '50 Year': 'rgba(128, 0, 106, .4)',
+        '100 Year': 'rgba(128, 0, 246, .4)',
+    }
+    rperiods = rperiods.astype(float).round(2).astype(str)
+    rperiods.columns = [f'{c} Year' for c in rperiods.columns]
+    rperiods.index = rperiods.index.astype(int).astype(str)
+    rperiods = rperiods.reset_index()
+
+    headers = "".join([f'<th>{x}</th>' for x in rperiods.columns])
+    rows = []
+    for row in rperiods.values.tolist():
+        cells = []
+        for col_idx, cell in enumerate(row):
+            if col_idx == 0:
+                cells.append(f'<td>{cell}</td>')
+                continue
+            column_name = rperiods.columns[col_idx]
+            stylestring = f'background-color: {colors[column_name]}'
+            cells.append(f'<td style="{stylestring}">{cell}</td>')
+        rows.append(f'<tr>{"".join(cells)}</tr>')
+    rows = "".join(rows)
+
+    return f'<table><thead><tr>{headers}</tr></thead><tbody>{rows}</tbody></table>'
diff --git a/geoglows/templates/probabilities_table.html b/geoglows/templates/probabilities_table.html
deleted file mode 100644
index 23a0534..0000000
--- a/geoglows/templates/probabilities_table.html
+++ /dev/null
@@ -1,71 +0,0 @@
-

Percent of Ensembles that Exceed Return Periods

- - - - - {% for day in days %} - - {% endfor %} - - - - {% for r in r2 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - - - {% for r in r5 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - - - {% for r in r10 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - - - {% for r in r25 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - - - {% for r in r50 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - - - {% for r in r100 %} - {% if r != 0 %} - - {% else %} - - {% endif %} - {% endfor %} - - -
Dates{{ day }}
2-yr Return Period{{ r }}%-
5-yr Return Period{{ r }}%-
10-yr Return Period{{ r }}%-
25-yr Return Period{{ r }}%-
50-yr Return Period{{ r }}%-
100-yr Return Period{{ r }}%-
\ No newline at end of file diff --git a/geoglows/templates/return_periods_table.html b/geoglows/templates/return_periods_table.html deleted file mode 100644 index c81dd05..0000000 --- a/geoglows/templates/return_periods_table.html +++ /dev/null @@ -1,17 +0,0 @@ -
-Return Periods for Stream {{ reach_id }} (m^3/s) - - - - {% for r in rp %} - - {% endfor %} - - - {% for r in rp %} - - {% endfor %} - - -
{{ r }}
{{ rp[r] }}
-
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9cbf7e5..85be377 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,13 @@ +dask +fastparquet requests pandas -plotly>=4.6 -jinja2 -shapely -scipy -numpy +plotly>=5 +shapely>=2 +scipy>=1 +s3fs +numpy>=1 hydrostats -HydroErr \ No newline at end of file +HydroErr +xarray +zarr \ No newline at end of file diff --git a/setup.py b/setup.py index 21776e9..3547621 100644 --- a/setup.py +++ b/setup.py @@ -1,28 +1,44 @@ -from setuptools import setup +import re + +from setuptools import setup, find_packages + +NAME = 'geoglows' +DESCRIPTION = 'Package for accessing data from the GEOGLOWS Hydrological Model' +URL = 'https://data.geoglows.org' +AUTHOR = 'Riley Hales PhD' +REQUIRES_PYTHON = '>=3.10.0' +LICENSE = 'BSD 3-Clause Clear License' with open("README.md", "r") as readme: - long_description = readme.read() + LONG_DESCRIPTION = readme.read() + +with open(f'./{NAME}/__init__.py') as f: + version_pattern = r'__version__ = [\'"](.+)[\'"]' + VERSION = re.search(version_pattern, f.read()).group(1) -with open('requirements.txt', 'r') as req: - install_requires = req.read().splitlines() +with open('./requirements.txt') as f: + INSTALL_REQUIRES = f.read().splitlines() setup( - name='geoglows', - packages=['geoglows'], - version='0.27.1', - description='Package for accessing data and APIs developed for the GEOGloWS initiative', - long_description=long_description, + name=NAME, + packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", - author='Riley Hales', - url='https://geoglows.org', - project_urls=dict(Documentation='https://geoglows.readthedocs.io', - Source='https://github.com/BYU-Hydroinformatics/geoglows'), - license='BSD 3-Clause', + author=AUTHOR, + url=URL, + project_urls=dict( + Homepage='https://data.geoglows.org', + Documentation='https://geoglows.readthedocs.io', + Source='https://github.com/geoglows/pygeoglows', + ), + license=LICENSE, license_family='BSD', package_data={'': ['*.ipynb', '*.html']}, include_package_data=True, classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering :: Hydrology', 'Topic :: Scientific/Engineering :: Visualization', @@ -30,5 +46,6 @@ 'License :: OSI Approved :: BSD License', 'Natural Language :: English', ], - install_requires=install_requires + install_requires=INSTALL_REQUIRES, + python_requires=REQUIRES_PYTHON, )
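
Taken together, this change removes the REST-backed `geoglows.streamflow` module and its jinja2/html templates in favor of the `geoglows.data`, `geoglows.streams`, and `geoglows.tables` modules added above. A minimal sketch of the new workflow, assuming the package exposes these modules at the top level (as the imports in this diff suggest) and that each data function accepts the river ID as its required argument; the coordinates and the resulting LINKNO are illustrative, not real values:

.. code-block:: python

    import geoglows

    # nearest LINKNO (river ID) to a lat/lon, computed from the package metadata tables
    river_id = geoglows.streams.latlon_to_reach(40.25, -111.65)

    # retrospective simulation and return period thresholds for that river
    hist = geoglows.data.retrospective(river_id)
    rps = geoglows.data.return_periods(river_id)

    # render the return period thresholds as an html string, per geoglows/tables.py above
    html_table = geoglows.tables.return_periods(rps)

Because the tables functions return plain html strings rather than rendered templates, the output can be embedded directly in a web page or shown in a notebook with `IPython.display.HTML`.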