diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f000461..7c88c4dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: debug-statements - id: mixed-line-ending - id: no-commit-to-branch # Prevent committing to main / master - - id: check-merge-conflict # Check for files that contain merge conflict + # - id: check-merge-conflict # Check for files that contain merge conflict - repo: https://github.com/PyCQA/isort rev: 5.13.0 hooks: diff --git a/docs/api.rst b/docs/api.rst index d5f064df..aabd383b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -18,6 +18,8 @@ Metadata - :py:class:`~data.core.metadata.Metadata` - :py:class:`~data.core.metadata.RawMetadata` - :py:class:`~data.readers.grib.metadata.GribMetadata` +- :py:class:`~data.readers.grib.metadata.GribFieldMetadata` +- :py:class:`~data.readers.grib.metadata.StandAloneGribMetadata` Numpy fields --------------- diff --git a/docs/examples/grib_metadata_object.ipynb b/docs/examples/grib_metadata_object.ipynb new file mode 100644 index 00000000..21fe8164 --- /dev/null +++ b/docs/examples/grib_metadata_object.ipynb @@ -0,0 +1,786 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d08d9138-0a0a-495c-b2f0-27b47e45847c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## GRIB: using the metadata object" + ] + }, + { + "cell_type": "raw", + "id": "25dd1879-9f88-4555-a847-a0ed45e3d6e4", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We will work with a GRIB file containing 6 messages. First we ensure the example file is available, then read the file with :ref:`from_source() `." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d1294cff-422a-4ed5-a1cf-87c6cf9195ad", + "metadata": { + "editable": true, + "raw_mimetype": "", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import earthkit.data\n", + "earthkit.data.download_example_file(\"test6.grib\")\n", + "ds = earthkit.data.from_source(\"file\", \"test6.grib\")" + ] + }, + { + "cell_type": "markdown", + "id": "d3392799-8f51-47f3-95b1-4a53d67a2c29", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### The metadata object" + ] + }, + { + "cell_type": "raw", + "id": "b2c90171-2ce4-445c-9993-3354286ba570", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "The recommended way to get field metadata for a given key or keys is to use :py:meth:`~data.readers.grib.codes.GribField.metadata`. See :ref:`/examples/grib_metadata.ipynb` for more examples." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9daeae60-e906-4bd0-9055-e940218bb12c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'t'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[0].metadata(\"shortName\")" + ] + }, + { + "cell_type": "raw", + "id": "9282df0e-2430-44fb-9025-29691bb3a24b", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Internally the field metadata is represented by a GribFieldMetadata object. Calling :py:meth:`~data.readers.grib.codes.GribField.metadata` without any arguments will return this object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1b795c50-f942-4417-b7e3-bd4aee4d11f5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md = ds[0].metadata()\n", + "md" + ] + }, + { + "cell_type": "markdown", + "id": "0ad46cbe-9413-435a-bf5c-db2cb8cf8203", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "It can be used as a dict." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5d5a9ad8-981e-440e-8327-9b21468ae42c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('t', 1000)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[\"shortName\"], md.get(\"level\")" + ] + }, + { + "cell_type": "markdown", + "id": "3d422ff7-c5ef-4f8d-b633-ecd93e418f3f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "
\n", + "Warning: this object does not own a GRIB handle but contains a reference to the field, which provides access to the GRIB handle. Therefore if you need to store it for later use it is recommended to create a copy with override() (see below) to decouple it from the field object.
" + ] + }, + { + "cell_type": "markdown", + "id": "e50bd099-336f-4952-941b-aa58e93f4c70", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Creating a copy of the metadata object" + ] + }, + { + "cell_type": "raw", + "id": "6cd89c6c-bcd5-4f83-b83b-03c28447ae41", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can create a copy with :py:meth:`~data.readers.grib.metadata.GribMetadata.override`, which always returns a new metadata object storing a cloned GRIB handle, thus it is decoupled from the field. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1369ca99-40d2-4352-8c9e-94666e5ebeec", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_copy = md.override()\n", + "md_copy" + ] + }, + { + "cell_type": "raw", + "id": "388d83a5-8165-4b3d-a121-2c7f2cb578e1", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "By default :py:meth:`~data.readers.grib.metadata.GribMetadata.override` is called with ``headers_only_clone=True`` generating the new handle with all the data values (and some related information) removed. With this the resulting object can be significantly smaller, especially if the data section is large. The downside is that now the value related keys either cannot be accessed or give back wrong values. 
E.g when using the \"average\" key we get:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "568fe5c9-df84-4d49-aee1-ada0e6a15c28", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(279.70703560965404, 47485.4296875)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[\"average\"], md_copy[\"average\"]" + ] + }, + { + "cell_type": "markdown", + "id": "8796976f-a02d-4101-8cff-c14a7730d82c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "To get a copy without shrinking the GRIB handle use ``headers_only_clone=False``." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba989ad8-e034-4168-bc29-1d73877edad2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "md_copy_full = md.override(headers_only_clone=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "15d723c7-2f73-4c14-ab87-2a94de2379f8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "279.70703560965404" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_copy_full[\"average\"]" + ] + }, + { + "cell_type": "markdown", + "id": "3240cef2-baa9-4a87-a83a-dafa97b78e43", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Changing the metadata" + ] + }, + { + "cell_type": "raw", + "id": "449f7fe0-222b-456f-bb15-dbe263774f4b", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "To change metadata values pass key value pairs or a dict to 
:py:meth:`~data.readers.grib.metadata.GribMetadata.override`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "40c6d232-03de-402b-82bf-8647e8a7bece", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('z', 850)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_mod = md.override(shortName=\"z\", level=\"850\")\n", + "md_mod[\"shortName\"], md_mod[\"level\"]" + ] + }, + { + "cell_type": "markdown", + "id": "31b017d2-843a-4dc5-8d49-b8c05f30ed37", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Since md_mod contains a clone of the GRIB handle the original metadata did not change." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ef78a3ec-4ea2-4ff5-8c90-e60b5e07e77f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('t', 1000)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md[\"shortName\"], md[\"level\"]" + ] + }, + { + "cell_type": "markdown", + "id": "759e4311-2419-4790-883a-84884190a8c7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Creating array fields from metadata and values" + ] + }, + { + "cell_type": "markdown", + "id": "0316233b-8da2-49a0-b8b8-4cedd6aba6b0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "GRIB metadata objects play a part in building new fieldlist from (altered) values and metadata." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a691f6ef-4b5b-47d0-accf-5e956702b47c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The following example computes the wind speed on 1000 hPa and creates a new fieldlist with a single field containing the new values. The metadata is taken from the u field and the shortName is modified." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cb59ad5f-c48b-4943-984d-3abdf48fda8d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from earthkit.data import FieldList\n", + "import numpy as np\n", + "u = ds.sel(param=\"u\", level=1000)[0]\n", + "v = ds.sel(param=\"v\", level=1000)[0]\n", + "speed = np.sqrt(u.values**2 + v.values**2)\n", + "md_speed = u.metadata().override(shortName=\"ws\")\n", + "ds_speed = FieldList.from_array(speed, md_speed) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "85c32bfb-c929-404f-add9-9adae40418d2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
centreshortNametypeOfLevelleveldataDatedataTimestepRangedataTypenumbergridType
0ecmfwsisobaricInhPa10002018080112000an0regular_ll
\n", + "
" + ], + "text/plain": [ + " centre shortName typeOfLevel level dataDate dataTime stepRange \\\n", + "0 ecmf ws isobaricInhPa 1000 20180801 1200 0 \n", + "\n", + " dataType number gridType \n", + "0 an 0 regular_ll " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_speed.ls()" + ] + }, + { + "cell_type": "raw", + "id": "a5b3c420-f71d-4a58-8cbe-761a0ce73a9b", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "Please note that the resulting :py:class:`~data.sources.array_list.ArrayFieldList` always contains a :py:class:`~data.readers.grib.metadata.RestrictedGribMetadata` object for each field. These objects possess their own GRIB handles, which is ensured by creating a copy with ``override(headers_only_clone=True)`` when needed. On top of that metadata access is limited to keys not related to data values. Getting metadata on any other keys will throw an exception. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c6fe87ed-ee88-4f4d-a2b6-9401b364e2df", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_speed[0].metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "27686ac4-9382-4916-ad0e-be96a649d034", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Wind speed'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_speed[0].metadata(\"name\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "dc28fa77-4020-431f-ad37-e480a69f9d7f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KeyError 'average'\n" + ] + } + ], + "source": [ + "try:\n", + " ds_speed[0].metadata(\"average\")\n", + "except KeyError as e:\n", + " print(f\"KeyError {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "331fa5aa-a134-423e-a4e3-bc8adf779297", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "However, strictly speaking these keys do not represent metadata and they can be easily computed from the field values when needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8eab3462-3661-4fc1-9d23-8be05dc99cd8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "7.450183054360252" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_speed[0].values.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "351236ef-3b2a-4d1e-bef9-3a7154af2270", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev_ecc", + "language": "python", + "name": "dev_ecc" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/index.rst b/docs/examples/index.rst index f7883af2..7d992f1e 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -40,6 +40,7 @@ GRIB grib_selection.ipynb grib_indexing.ipynb grib_missing.ipynb + grib_metadata_object.ipynb numpy_fieldlist.ipynb grib_array_backends.ipynb grib_nearest_gridpoint.ipynb diff --git a/docs/examples/netcdf_fieldlist.ipynb b/docs/examples/netcdf_fieldlist.ipynb index 75905419..d59b035d 100644 --- a/docs/examples/netcdf_fieldlist.ipynb +++ b/docs/examples/netcdf_fieldlist.ipynb @@ -162,7 +162,7 @@ " \n", " variable\n", " level\n", - " time\n", + " valid_datetime\n", " units\n", " \n", " \n", @@ -171,126 +171,126 @@ " 0\n", " t\n", " 1000\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 1\n", " t\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 2\n", " t\n", " 
700\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 3\n", " t\n", " 500\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 4\n", " t\n", " 400\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 5\n", " t\n", " 300\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 6\n", " u\n", " 1000\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 7\n", " u\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 8\n", " u\n", " 700\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 9\n", " u\n", " 500\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 10\n", " u\n", " 400\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 11\n", " u\n", " 300\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 12\n", " v\n", " 1000\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 13\n", " v\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 14\n", " v\n", " 700\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 15\n", " v\n", " 500\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 16\n", " v\n", " 400\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 17\n", " v\n", " 300\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", @@ -298,25 +298,25 @@ "" ], "text/plain": [ - " variable level time units\n", - "0 t 1000 2018-08-01 12:00:00 K\n", - "1 t 850 2018-08-01 12:00:00 K\n", - "2 t 700 2018-08-01 12:00:00 K\n", - "3 t 500 2018-08-01 12:00:00 K\n", - "4 t 400 2018-08-01 12:00:00 K\n", - "5 t 
300 2018-08-01 12:00:00 K\n", - "6 u 1000 2018-08-01 12:00:00 m s**-1\n", - "7 u 850 2018-08-01 12:00:00 m s**-1\n", - "8 u 700 2018-08-01 12:00:00 m s**-1\n", - "9 u 500 2018-08-01 12:00:00 m s**-1\n", - "10 u 400 2018-08-01 12:00:00 m s**-1\n", - "11 u 300 2018-08-01 12:00:00 m s**-1\n", - "12 v 1000 2018-08-01 12:00:00 m s**-1\n", - "13 v 850 2018-08-01 12:00:00 m s**-1\n", - "14 v 700 2018-08-01 12:00:00 m s**-1\n", - "15 v 500 2018-08-01 12:00:00 m s**-1\n", - "16 v 400 2018-08-01 12:00:00 m s**-1\n", - "17 v 300 2018-08-01 12:00:00 m s**-1" + " variable level valid_datetime units\n", + "0 t 1000 2018-08-01T12:00:00 K\n", + "1 t 850 2018-08-01T12:00:00 K\n", + "2 t 700 2018-08-01T12:00:00 K\n", + "3 t 500 2018-08-01T12:00:00 K\n", + "4 t 400 2018-08-01T12:00:00 K\n", + "5 t 300 2018-08-01T12:00:00 K\n", + "6 u 1000 2018-08-01T12:00:00 m s**-1\n", + "7 u 850 2018-08-01T12:00:00 m s**-1\n", + "8 u 700 2018-08-01T12:00:00 m s**-1\n", + "9 u 500 2018-08-01T12:00:00 m s**-1\n", + "10 u 400 2018-08-01T12:00:00 m s**-1\n", + "11 u 300 2018-08-01T12:00:00 m s**-1\n", + "12 v 1000 2018-08-01T12:00:00 m s**-1\n", + "13 v 850 2018-08-01T12:00:00 m s**-1\n", + "14 v 700 2018-08-01T12:00:00 m s**-1\n", + "15 v 500 2018-08-01T12:00:00 m s**-1\n", + "16 v 400 2018-08-01T12:00:00 m s**-1\n", + "17 v 300 2018-08-01T12:00:00 m s**-1" ] }, "execution_count": 5, @@ -391,7 +391,7 @@ " \n", " variable\n", " level\n", - " time\n", + " valid_datetime\n", " units\n", " \n", " \n", @@ -400,14 +400,14 @@ " 0\n", " t\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 1\n", " t\n", " 700\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", @@ -415,9 +415,9 @@ "" ], "text/plain": [ - " variable level time units\n", - "0 t 850 2018-08-01 12:00:00 K\n", - "1 t 700 2018-08-01 12:00:00 K" + " variable level valid_datetime units\n", + "0 t 850 2018-08-01T12:00:00 K\n", + "1 t 700 2018-08-01T12:00:00 K" ] }, 
"execution_count": 7, @@ -514,7 +514,7 @@ { "data": { "text/plain": [ - "array([272.56485, 272.56485, 272.56485, 272.56485], dtype=float32)" + "array([272.56486405, 272.56486405, 272.56486405, 272.56486405])" ] }, "execution_count": 10, @@ -748,7 +748,7 @@ { "data": { "text/plain": [ - "NetCDFMetadata({'units': 'K', 'long_name': 'Temperature', 'standard_name': 'air_temperature', 'variable': 't', 'level': 1000, 'time': datetime.datetime(2018, 8, 1, 12, 0)})" + "NetCDFMetadata({'units': 'K', 'long_name': 'Temperature', 'standard_name': 'air_temperature', 'date': 20180801, 'time': 1200, 'variable': 't', 'level': 1000, 'levtype': 'level'})" ] }, "execution_count": 17, @@ -823,7 +823,7 @@ " \n", " variable\n", " level\n", - " time\n", + " valid_datetime\n", " units\n", " \n", " \n", @@ -832,14 +832,14 @@ " 0\n", " u\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", " 1\n", " v\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " m s**-1\n", " \n", " \n", @@ -847,9 +847,9 @@ "" ], "text/plain": [ - " variable level time units\n", - "0 u 850 2018-08-01 12:00:00 m s**-1\n", - "1 v 850 2018-08-01 12:00:00 m s**-1" + " variable level valid_datetime units\n", + "0 u 850 2018-08-01T12:00:00 m s**-1\n", + "1 v 850 2018-08-01T12:00:00 m s**-1" ] }, "execution_count": 19, @@ -890,7 +890,7 @@ " \n", " variable\n", " level\n", - " time\n", + " valid_datetime\n", " units\n", " \n", " \n", @@ -899,42 +899,42 @@ " 0\n", " t\n", " 1000\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 1\n", " t\n", " 850\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 2\n", " t\n", " 700\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 3\n", " t\n", " 500\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", " 4\n", " t\n", " 400\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " 
\n", " 5\n", " t\n", " 300\n", - " 2018-08-01 12:00:00\n", + " 2018-08-01T12:00:00\n", " K\n", " \n", " \n", @@ -942,13 +942,13 @@ "" ], "text/plain": [ - " variable level time units\n", - "0 t 1000 2018-08-01 12:00:00 K\n", - "1 t 850 2018-08-01 12:00:00 K\n", - "2 t 700 2018-08-01 12:00:00 K\n", - "3 t 500 2018-08-01 12:00:00 K\n", - "4 t 400 2018-08-01 12:00:00 K\n", - "5 t 300 2018-08-01 12:00:00 K" + " variable level valid_datetime units\n", + "0 t 1000 2018-08-01T12:00:00 K\n", + "1 t 850 2018-08-01T12:00:00 K\n", + "2 t 700 2018-08-01T12:00:00 K\n", + "3 t 500 2018-08-01T12:00:00 K\n", + "4 t 400 2018-08-01T12:00:00 K\n", + "5 t 300 2018-08-01T12:00:00 K" ] }, "execution_count": 20, @@ -1014,6 +1014,7 @@ "}\n", "\n", "html[theme=dark],\n", + "html[data-theme=dark],\n", "body[data-theme=dark],\n", "body.vscode-dark {\n", " --xr-font-color0: rgba(255, 255, 255, 1);\n", @@ -1346,21 +1347,21 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", - "
<xarray.Dataset>\n",
+       "
<xarray.Dataset> Size: 12kB\n",
        "Dimensions:    (longitude: 12, latitude: 7, level: 6, time: 1)\n",
        "Coordinates:\n",
-       "  * longitude  (longitude) float32 0.0 30.0 60.0 90.0 ... 270.0 300.0 330.0\n",
-       "  * latitude   (latitude) float32 90.0 60.0 30.0 0.0 -30.0 -60.0 -90.0\n",
-       "  * level      (level) int32 1000 850 700 500 400 300\n",
-       "  * time       (time) datetime64[ns] 2018-08-01T12:00:00\n",
+       "  * longitude  (longitude) float32 48B 0.0 30.0 60.0 90.0 ... 270.0 300.0 330.0\n",
+       "  * latitude   (latitude) float32 28B 90.0 60.0 30.0 0.0 -30.0 -60.0 -90.0\n",
+       "  * level      (level) int32 24B 1000 850 700 500 400 300\n",
+       "  * time       (time) datetime64[ns] 8B 2018-08-01T12:00:00\n",
        "Data variables:\n",
-       "    t          (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
-       "    u          (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
-       "    v          (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
+       "    t          (time, level, latitude, longitude) float64 4kB dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
+       "    u          (time, level, latitude, longitude) float64 4kB dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
+       "    v          (time, level, latitude, longitude) float64 4kB dask.array<chunksize=(1, 6, 7, 12), meta=np.ndarray>\n",
        "Attributes:\n",
        "    Conventions:  CF-1.6\n",
-       "    history:      2023-08-07 18:24:35 GMT by grib_to_netcdf-2.30.2: grib_to_n...
" + " dtype='float32', name='longitude'))
  • latitude
    PandasIndex
    PandasIndex(Index([90.0, 60.0, 30.0, 0.0, -30.0, -60.0, -90.0], dtype='float32', name='latitude'))
  • level
    PandasIndex
    PandasIndex(Index([1000, 850, 700, 500, 400, 300], dtype='int32', name='level'))
  • time
    PandasIndex
    PandasIndex(DatetimeIndex(['2018-08-01 12:00:00'], dtype='datetime64[ns]', name='time', freq=None))
  • Conventions :
    CF-1.6
    history :
    2023-08-07 18:24:35 GMT by grib_to_netcdf-2.30.2: grib_to_netcdf -o tuv_pl.nc tuv_pl.grib
  • " ], "text/plain": [ - "\n", + " Size: 12kB\n", "Dimensions: (longitude: 12, latitude: 7, level: 6, time: 1)\n", "Coordinates:\n", - " * longitude (longitude) float32 0.0 30.0 60.0 90.0 ... 270.0 300.0 330.0\n", - " * latitude (latitude) float32 90.0 60.0 30.0 0.0 -30.0 -60.0 -90.0\n", - " * level (level) int32 1000 850 700 500 400 300\n", - " * time (time) datetime64[ns] 2018-08-01T12:00:00\n", + " * longitude (longitude) float32 48B 0.0 30.0 60.0 90.0 ... 270.0 300.0 330.0\n", + " * latitude (latitude) float32 28B 90.0 60.0 30.0 0.0 -30.0 -60.0 -90.0\n", + " * level (level) int32 24B 1000 850 700 500 400 300\n", + " * time (time) datetime64[ns] 8B 2018-08-01T12:00:00\n", "Data variables:\n", - " t (time, level, latitude, longitude) float32 dask.array\n", - " u (time, level, latitude, longitude) float32 dask.array\n", - " v (time, level, latitude, longitude) float32 dask.array\n", + " t (time, level, latitude, longitude) float64 4kB dask.array\n", + " u (time, level, latitude, longitude) float64 4kB dask.array\n", + " v (time, level, latitude, longitude) float64 4kB dask.array\n", "Attributes:\n", " Conventions: CF-1.6\n", " history: 2023-08-07 18:24:35 GMT by grib_to_netcdf-2.30.2: grib_to_n..." diff --git a/docs/guide/data_format/grib.rst b/docs/guide/data_format/grib.rst index daf6f918..d028d15b 100644 --- a/docs/guide/data_format/grib.rst +++ b/docs/guide/data_format/grib.rst @@ -70,3 +70,9 @@ Examples: - :ref:`/examples/grib_metadata.ipynb` - :ref:`/examples/grib_selection.ipynb` - :ref:`/examples/grib_missing.ipynb` + + +Memory management +++++++++++++++++++++ + +See details :ref:`here `. diff --git a/docs/guide/misc/grib_memory.rst b/docs/guide/misc/grib_memory.rst new file mode 100644 index 00000000..9c25847c --- /dev/null +++ b/docs/guide/misc/grib_memory.rst @@ -0,0 +1,130 @@ +.. _grib-memory: + +GRIB field memory management +////////////////////////////// + +:ref:`grib` is a message-based binary format, where each message is regarded as a field. 
For reading GRIB, earthkit-data relies on :xref:`eccodes`, which, when loading a message into memory, represents it as a ``GRIB handle``. In the low level API, the GRIB handle is the object that holds the data and metadata of a GRIB field, therefore it can use up a significant amount of memory. + +Determining when a GRIB handle needs to be created and when it can be released is important for memory management. Earthkit-data provides several settings to control this behaviour depending on how we actually read the data. + +Reading GRIB data as a stream iterator +======================================== + +We can read :ref:`grib` data as a :ref:`stream ` iterator e.g. with the following code: + +.. code-block:: python + + import earthkit.data + + url = "https://get.ecmwf.int/repository/test-data/earthkit-data/examples/test6.grib" + ds = earthkit.data.from_source("url", url, stream=True) + for f in ds: + print(f) + +Here, field ``f`` is not attached to a fieldlist and only exists in the scope of the iteration (in the for loop). During its existence the field keeps the GRIB handle in memory and if used in the way shown above, only one field can exist at a time. Once the stream is consumed there is no way to access the data again (unless we read it with :func:`from_source` again). + +Reading all GRIB data from a stream into memory +=============================================== + +We can load :ref:`grib` data fully into memory when we read it as a :ref:`stream ` with the ``read_all=True`` option in :func:`from_source`. + +.. code-block:: python + + import earthkit.data + + url = "https://get.ecmwf.int/repository/test-data/earthkit-data/examples/test6.grib" + ds = earthkit.data.from_source("url", url, stream=True, read_all=True) + +With this, the entire ``ds`` fieldlist, including all the fields and the related GRIB handles, is stored in memory. 
+ +Reading data from disk and managing its memory +============================================== + +When reading :ref:`grib` data from disk as a :ref:`file source `, it is represented as a fieldlist and loaded lazily. After the (fast) initial scan for field offsets and lengths, no actual fields are created and no data is read into memory. When we start using the fieldlist, e.g. by iterating over the fields, accessing data or metadata etc., the fields will be created **on demand** and the related GRIB handles will be loaded from disk **when needed**. Whether this data or part of it stays in memory depends on the following :ref:`settings `: + +- :ref:`grib-field-policy ` +- :ref:`grib-handle-policy ` +- :ref:`grib-handle-cache-size ` +- :ref:`use-grib-metadata-cache ` + +.. _grib-field-policy: + +grib-field-policy +++++++++++++++++++++++++++++ + +Controls whether fields are kept in memory. The default is ``"persistent"``. The possible values are: + +- ``"persistent"``: fields are kept in memory until the fieldlist is deleted +- ``"temporary"``: fields are deleted when they go out of scope and recreated on demand + +The actual memory used by a field depends on whether it owns the GRIB handle of the related GRIB message. This is controlled by the :ref:`grib-handle-policy ` settings. + +A field can also cache its metadata access for performance, thus increasing memory usage. This is controlled by the :ref:`use-grib-metadata-cache ` settings. + +.. _grib-handle-policy: + +grib-handle-policy +++++++++++++++++++++++++++++ + +Controls whether GRIB handles are kept in memory. The default is ``"cache"``. The possible values are: + +- ``"cache"``: a separate in-memory LRU cache is created for the GRIB handles in the fieldlist. The maximum number of GRIB handles kept in this cache is controlled by :ref:`grib-handle-cache-size `. In this mode, field objects are lightweight and only store the GRIB handle cache index, and can only access the GRIB handles via the cache. 
+- ``"persistent"``: once a GRIB handle is created, a field keeps it in memory until the field is deleted +- ``"temporary"``: for each call to data and metadata access on a field, a new GRIB handle is created and released once the access has finished. + +.. _grib-handle-cache-size: + +grib-handle-cache-size +++++++++++++++++++++++++++++ + +When :ref:`grib-handle-policy ` is ``"cache"``, the setting ``grib-handle-cache-size`` (default is ``1``) specifies the maximum number of GRIB handles kept in an in-memory cache per fieldlist. This is an LRU cache, so when it is full, the least recently used GRIB handle is removed and a new GRIB message is loaded from disk and added to the cache. + +.. _use-grib-metadata-cache: + +use-grib-metadata-cache ++++++++++++++++++++++++++++++++++++ + +When ``use-grib-metadata-cache`` is ``True`` (this is the default) all the fields will cache their metadata access. This is an in-memory cache attached to the field and implemented for the low-level metadata accessor for individual keys. This cache can be useful when the same metadata keys are accessed multiple times for the same field. + + +Overriding the settings +++++++++++++++++++++++++++++ + +In addition to changing the :ref:`settings` themselves, it is possible to override the 4 parameters above when loading a given fieldlist by passing them as keyword arguments to :func:`from_source`. The parameter names are the same but the dashes are replaced by underscores. When a parameter is not specified in :func:`from_source` or is set to None, its value is taken from the actual :ref:`settings`. E.g.: + +.. 
code-block:: python + + import earthkit.data + + ds = earthkit.data.from_source( + "file", + "test6.grib", + grib_field_policy="persistent", + grib_handle_policy="temporary", + grib_handle_cache_size=0, + use_grib_metadata_cache=True, + ) + + +Reading data from disk as a stream +++++++++++++++++++++++++++++++++++ + +Whilst the usual way of reading GRIB data from disk loads fields lazily (i.e. only when they are actually used), it is also possible to read all +fields up-front and keep them in memory by reading it as a :ref:`stream source ` with the ``read_all=True`` option. + +.. code-block:: python + + import earthkit.data + + f = open("test6.grib", "rb") + ds = earthkit.data.from_source("stream", f, read_all=True) + +.. warning:: + + Use this option carefully since your data might not fit into memory. + + + +.. note:: + The default settings are chosen to keep the memory usage low and the performance high. However, depending on the use case, the settings can be adjusted to optimize the memory + usage and performance. 
diff --git a/docs/skip_api_rules.py b/docs/skip_api_rules.py index 8e247d09..9eb514c8 100644 --- a/docs/skip_api_rules.py +++ b/docs/skip_api_rules.py @@ -103,6 +103,26 @@ "statistics", "xarray_open_dataset_kwargs", ], + "data.sources.array_list.ArrayField": ["merge", "mutate"], + "data.sources.array_list.ArrayFieldList": [ + "cache_file", + "dataset", + "from_dict", + "from_mask", + "from_multi", + "from_slice", + "full", + "graph", + "ignore", + "merge", + "mutate", + "new_mask_index", + "parent", + "scaled", + "settings", + "statistics", + "xarray_open_dataset_kwargs", + ], } @@ -120,8 +140,10 @@ def _skip_api_items(app, what, name, obj, skip, options): "data.readers.bufr.bufr", "data.readers.grib.codes", "data.readers.grib.index", + "data.readers.grib.metadata", "data.readers.csv", "data.sources", + "data.sources.array_list", "data.sources.numpy_list", "data.utils", "data.utils.bbox", @@ -136,9 +158,11 @@ def _skip_api_items(app, what, name, obj, skip, options): "data.readers.bufr.bufr", "data.readers.grib", "data.readers.grib.index", + "data.readers.grib.metadata", "data.readers.csv", "data.sources", "data.sources.numpy_list", + "data.sources.array_list", "data.utils", "data.utils.bbox", ]: @@ -154,9 +178,14 @@ def _skip_api_items(app, what, name, obj, skip, options): "data.readers.grib.codes.GribField", "data.readers.grib.index.GribFieldList", "data.readers.grib.metadata.GribMetadata", + "data.readers.grib.metadata.GribFieldMetadata", + "data.readers.grib.metadata.StandAloneGribMetadata", + "data.readers.grib.metadata.RestrictedGribMetadata", "data.readers.csv.CSVReader", "data.sources.numpy_list.NumpyField", "data.sources.numpy_list.NumpyFieldList", + "data.sources.array_list.ArrayField", + "data.sources.array_list.ArrayFieldList", "data.utils.bbox.BoundingBox", ]: skip = True diff --git a/environment.yml b/environment.yml index 856d6788..3e3a029d 100644 --- a/environment.yml +++ b/environment.yml @@ -30,6 +30,7 @@ dependencies: - eccovjson>=0.0.5 - 
earthkit-geo>=0.2.0 - tqdm>=4.63.0 +- lru-dict - markdown - make - mypy diff --git a/pyproject.toml b/pyproject.toml index 46c99f53..20335430 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "filelock", "jinja2", "jsonschema", + "lru-dict", "markdown", "multiurl", "netcdf4", @@ -87,7 +88,6 @@ optional-dependencies.dev = [ "pytest-forked", "pytest-timeout", ] - optional-dependencies.eccovjson = [ "eccovjson>=0.0.5", ] diff --git a/src/earthkit/data/core/__init__.py b/src/earthkit/data/core/__init__.py index f0c3f0f8..00a3a4be 100644 --- a/src/earthkit/data/core/__init__.py +++ b/src/earthkit/data/core/__init__.py @@ -98,12 +98,11 @@ def order_by(self, *args, **kwargs): """Reorder the elements of the object.""" self._not_implemented() - def unique_values(self, *coords, remapping=None, patches=None, progress_bar=True): + def unique_values(self, *coords, remapping=None, patches=None, progress_bar=False): """Given a list of metadata attributes, such as date, param, levels, returns the list of unique values for each attributes """ from earthkit.data.core.order import build_remapping - from earthkit.data.utils.progbar import progress_bar assert len(coords) assert all(isinstance(k, str) for k in coords), coords @@ -112,10 +111,13 @@ def unique_values(self, *coords, remapping=None, patches=None, progress_bar=True iterable = self if progress_bar: - iterable = progress_bar( - iterable=self, - desc=f"Finding coords in dataset for {coords}", - ) + from earthkit.data.utils.progbar import progress_bar + + if progress_bar: + iterable = progress_bar( + iterable=self, + desc=f"Finding coords in dataset for {coords}", + ) vals = defaultdict(dict) for f in iterable: diff --git a/src/earthkit/data/core/fieldlist.py b/src/earthkit/data/core/fieldlist.py index f823e798..c2ca9425 100644 --- a/src/earthkit/data/core/fieldlist.py +++ b/src/earthkit/data/core/fieldlist.py @@ -10,6 +10,7 @@ import math from abc import abstractmethod from collections import 
defaultdict +from functools import cached_property from earthkit.data.core import Base from earthkit.data.core.index import Index @@ -22,6 +23,58 @@ from earthkit.data.utils.metadata import metadata_argument +class FieldListIndices: + def __init__(self, field_list): + self.fs = field_list + self.user_indices = dict() + + @cached_property + def default_index_keys(self): + if len(self.fs) > 0: + return self.fs[0]._metadata.index_keys() + else: + return [] + + def _index_value(self, key): + values = set() + for f in self.fs: + v = f.metadata(key, default=None) + if v is not None: + values.add(v) + + return sorted(list(values)) + + @cached_property + def default_indices(self): + indices = defaultdict(set) + keys = self.default_index_keys + for f in self.fs: + v = f.metadata(keys, default=None) + for i, k in enumerate(keys): + if v[i] is not None: + indices[k].add(v[i]) + + return {k: sorted(list(v)) for k, v in indices.items()} + + def indices(self, squeeze=False): + r = {**self.default_indices, **self.user_indices} + + if squeeze: + return {k: v for k, v in r.items() if len(v) > 1} + else: + return r + + def index(self, key): + if key in self.user_indices: + return self.user_indices[key] + + if key in self.default_index_keys: + return self.default_indices[key] + + self.user_indices[key] = self._index_value(key) + return self.user_indices[key] + + class Field(Base): r"""Represent a Field.""" @@ -115,18 +168,15 @@ def values(self): return self._array_backend.array_ns.reshape(v, n) return v - def _make_metadata(self): - r"""Create a field metadata object.""" - self._not_implemented() - @property def _metadata(self): r"""Metadata: Get the object representing the field's metadata.""" if self.__metadata is None: + # TODO: remove this legacy method self.__metadata = self._make_metadata() return self.__metadata - def to_numpy(self, flatten=False, dtype=None): + def to_numpy(self, flatten=False, dtype=None, index=None): r"""Return the values stored in the field as an ndarray. 
Parameters @@ -137,6 +187,9 @@ def to_numpy(self, flatten=False, dtype=None): dtype: str, numpy.dtype or None Typecode or data-type of the array. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: ndarray indexing object, optional + The index of the values and to be extracted. When it + is None all the values are extracted Returns ------- @@ -148,10 +201,12 @@ def to_numpy(self, flatten=False, dtype=None): v = numpy_backend().to_array(v, self.raw_values_backend) shape = self._required_shape(flatten) if shape != v.shape: - return v.reshape(shape) + v = v.reshape(shape) + if index is not None: + v = v[index] return v - def to_array(self, flatten=False, dtype=None, array_backend=None): + def to_array(self, flatten=False, dtype=None, array_backend=None, index=None): r"""Return the values stored in the field in the format of :attr:`array_backend`. @@ -163,6 +218,9 @@ def to_array(self, flatten=False, dtype=None, array_backend=None): dtype: str, array.dtype or None Typecode or data-type of the array. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: array indexing object, optional + The index of the values and to be extracted. 
When it + is None all the values are extracted Returns ------- @@ -177,17 +235,21 @@ def to_array(self, flatten=False, dtype=None, array_backend=None): ) shape = self._required_shape(flatten) if shape != v.shape: - return self._array_backend.array_ns.reshape(v, shape) + v = self._array_backend.array_ns.reshape(v, shape) + if index is not None: + v = v[index] return v - def _required_shape(self, flatten): - return self.shape if not flatten else (math.prod(self.shape),) + def _required_shape(self, flatten, shape=None): + if shape is None: + shape = self.shape + return shape if not flatten else (math.prod(shape),) def _array_matches(self, array, flatten=False, dtype=None): shape = self._required_shape(flatten) return shape == array.shape and (dtype is None or dtype == array.dtype) - def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): + def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None, index=None): r"""Return the values and/or the geographical coordinates for each grid point. Parameters @@ -201,6 +263,9 @@ def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): dtype: str, array.dtype or None Typecode or data-type of the arrays. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: array indexing object, optional + The index of the values and or the latitudes/longitudes to be extracted. When it + is None all the values and/or coordinates are extracted. 
Returns ------- @@ -252,18 +317,22 @@ def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): if k not in _keys: raise ValueError(f"data: invalid argument: {k}") - r = [self._to_array(_keys[k][0](dtype=dtype), source_backend=_keys[k][1]) for k in keys] - shape = self._required_shape(flatten) - if shape != r[0].shape: - # r = [x.reshape(shape) for x in r] - r = [self._array_backend.array_ns.reshape(x, shape) for x in r] + r = [] + for k in keys: + v = self._to_array(_keys[k][0](dtype=dtype), source_backend=_keys[k][1]) + shape = self._required_shape(flatten) + if shape != v.shape: + v = self._array_backend.array_ns.reshape(v, shape) + if index is not None: + v = v[index] + r.append(v) if len(r) == 1: return r[0] else: return self._array_backend.array_ns.stack(r) - def to_points(self, flatten=False, dtype=None): + def to_points(self, flatten=False, dtype=None, index=None): r"""Return the geographical coordinates in the data's original Coordinate Reference System (CRS). @@ -276,6 +345,9 @@ def to_points(self, flatten=False, dtype=None): Typecode or data-type of the arrays. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: array indexing object, optional + The index of the coordinates to be extracted. When it is None + all the values are extracted. 
Returns ------- @@ -303,14 +375,17 @@ def to_points(self, flatten=False, dtype=None): if shape != x.shape: x = self._array_backend.array_ns.reshape(x, shape) y = self._array_backend.array_ns.reshape(y, shape) + if index is not None: + x = x[index] + y = y[index] return dict(x=x, y=y) elif self.projection().CARTOPY_CRS == "PlateCarree": - lon, lat = self.data(("lon", "lat"), flatten=flatten, dtype=dtype) + lon, lat = self.data(("lon", "lat"), flatten=flatten, dtype=dtype, index=index) return dict(x=lon, y=lat) else: raise ValueError("to_points(): geographical coordinates in original CRS are not available") - def to_latlon(self, flatten=False, dtype=None): + def to_latlon(self, flatten=False, dtype=None, index=None): r"""Return the latitudes/longitudes of all the gridpoints in the field. Parameters @@ -322,6 +397,9 @@ def to_latlon(self, flatten=False, dtype=None): Typecode or data-type of the arrays. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: array indexing object, optional + The index of the latitudes/longitudes to be extracted. When it is None + all the values are extracted. Returns ------- @@ -335,7 +413,7 @@ def to_latlon(self, flatten=False, dtype=None): to_points """ - lon, lat = self.data(("lon", "lat"), flatten=flatten, dtype=dtype) + lon, lat = self.data(("lon", "lat"), flatten=flatten, dtype=dtype, index=index) return dict(lat=lat, lon=lon) def grid_points(self): @@ -539,7 +617,7 @@ def metadata(self, *keys, astype=None, **kwargs): '2 metre temperature' """ # when called without arguments returns the metadata object - if len(keys) == 0 and astype is None and len(kwargs) == 0: + if len(keys) == 0 and astype is None and not kwargs: return self._metadata namespace = kwargs.pop("namespace", None) @@ -659,8 +737,6 @@ class FieldList(Index): defaults to "numpy". 
""" - _md_indices = {} - def __init__(self, array_backend=None, **kwargs): self._array_backend = ensure_backend(array_backend) super().__init__(**kwargs) @@ -716,31 +792,9 @@ def ignore(self): else: return False - @cached_method - def _default_index_keys(self): - if len(self) > 0: - return self[0].metadata().index_keys() - else: - return [] - - def _find_index_values(self, key): - values = set() - for f in self: - v = f.metadata(key, default=None) - if v is not None: - values.add(v) - return sorted(list(values)) - - def _find_all_index_dict(self): - indices = defaultdict(set) - for f in self: - for k in self._default_index_keys(): - v = f.metadata(k, default=None) - if v is None: - continue - indices[k].add(v) - - return {k: sorted(list(v)) for k, v in indices.items()} + @cached_property + def _md_indices(self): + return FieldListIndices(self) def indices(self, squeeze=False): r"""Return the unique, sorted values for a set of metadata keys (see below) @@ -781,12 +835,7 @@ def indices(self, squeeze=False): used in :obj:`indices`. """ - if not self._md_indices: - self._md_indices = self._find_all_index_dict() - if squeeze: - return {k: v for k, v in self._md_indices.items() if len(v) > 1} - else: - return self._md_indices + return self._md_indices.indices(squeeze=squeeze) def index(self, key): r"""Return the unique, sorted values of the specified metadata ``key`` from all the fields. @@ -815,11 +864,7 @@ def index(self, key): [300, 400, 500, 700, 850, 1000] """ - if key in self.indices(): - return self.indices()[key] - - self._md_indices[key] = self._find_index_values(key) - return self._md_indices[key] + return self._md_indices.index(key) def to_numpy(self, **kwargs): r"""Return all the fields' values as an ndarray. It is formed as the array of the @@ -869,7 +914,7 @@ def to_array(self, **kwargs): @property def values(self): - r"""array-likr: Get all the fields' values as a 2D array. 
It is formed as the array of + r"""array-like: Get all the fields' values as a 2D array. It is formed as the array of :obj:`GribField.values ` per field. See Also @@ -893,7 +938,13 @@ def values(self): x = [f.values for f in self] return self._array_backend.array_ns.stack(x) - def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): + def data( + self, + keys=("lat", "lon", "value"), + flatten=False, + dtype=None, + index=None, + ): r"""Return the values and/or the geographical coordinates. Only works when all the fields have the same grid geometry. @@ -910,6 +961,9 @@ def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): Typecode or data-type of the arrays. When it is :obj:`None` the default type used by the underlying data accessor is used. For GRIB it is ``float64``. + index: array indexing object, optional + The index of the values to be extracted from each field. When it is None all the + values are extracted. Returns ------- @@ -962,7 +1016,7 @@ def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): keys = [keys] if "lat" in keys or "lon" in keys: - latlon = self[0].to_latlon(flatten=flatten, dtype=dtype) + latlon = self[0].to_latlon(flatten=flatten, dtype=dtype, index=index) r = [] for k in keys: @@ -971,10 +1025,9 @@ def data(self, keys=("lat", "lon", "value"), flatten=False, dtype=None): elif k == "lon": r.append(latlon["lon"]) elif k == "value": - r.extend([f.to_array(flatten=flatten, dtype=dtype) for f in self]) + r.extend([f.to_array(flatten=flatten, dtype=dtype, index=index) for f in self]) else: raise ValueError(f"data: invalid argument: {k}") - return self._array_backend.array_ns.stack(r) elif len(self) == 0: @@ -1074,7 +1127,7 @@ def _proc(keys, n): @cached_method def _default_ls_keys(self): if len(self) > 0: - return self[0].metadata().ls_keys() + return self[0]._metadata.ls_keys() else: return [] @@ -1165,7 +1218,7 @@ def _proc(): @cached_method def _describe_keys(self): if len(self) > 0: - 
return self[0].metadata().describe_keys() + return self[0]._metadata.describe_keys() else: return [] @@ -1226,11 +1279,14 @@ def to_points(self, **kwargs): else: raise ValueError("Fields do not have the same grid geometry") - def to_latlon(self, **kwargs): + def to_latlon(self, index=None, **kwargs): r"""Return the latitudes/longitudes shared by all the fields. Parameters ---------- + index: array indexing object, optional + The index of the latitudes/longitudes to be extracted. When it is None + all the values are extracted. **kwargs: dict, optional Keyword arguments passed to :meth:`Field.to_latlon() ` @@ -1331,9 +1387,9 @@ def bounding_box(self): @cached_method def _is_shared_grid(self): if len(self) > 0: - grid = self[0].metadata().geography._unique_grid_id() + grid = self[0]._metadata.geography._unique_grid_id() if grid is not None: - return all(f.metadata().geography._unique_grid_id() == grid for f in self) + return all(f._metadata.geography._unique_grid_id() == grid for f in self) return False @detect_out_filename diff --git a/src/earthkit/data/core/index.py b/src/earthkit/data/core/index.py index d5f81ca1..2ee3caac 100644 --- a/src/earthkit/data/core/index.py +++ b/src/earthkit/data/core/index.py @@ -105,8 +105,13 @@ def build_actions(self, kwargs): def compare_elements(self, a, b): assert callable(self.remapping), (type(self.remapping), self.remapping) - a_metadata = self.remapping(a.metadata) - b_metadata = self.remapping(b.metadata) + if self.remapping: + a_metadata = self.remapping(a.metadata) + b_metadata = self.remapping(b.metadata) + else: + a_metadata = a.metadata + b_metadata = b.metadata + for k, v in self.actions.items(): n = v(a_metadata(k, default=None), b_metadata(k, default=None)) if n != 0: @@ -192,9 +197,6 @@ def new_mask_index(self, *args, **kwargs): def __len__(self): self._not_implemented() - def _normalize_kwargs_names(self, **kwargs): - return kwargs - def sel(self, *args, remapping=None, **kwargs): """Uses metadata values to select a 
subset of the elements from a fieldlist-like object. @@ -287,7 +289,6 @@ def sel(self, *args, remapping=None, **kwargs): GribField(t,850,20180801,1200,0,0) """ kwargs = normalize_selection(*args, **kwargs) - kwargs = self._normalize_kwargs_names(**kwargs) if not kwargs: return self @@ -382,7 +383,6 @@ def isel(self, *args, **kwargs): """ kwargs = normalize_selection(*args, **kwargs) - kwargs = self._normalize_kwargs_names(**kwargs) if not kwargs: return self @@ -487,7 +487,6 @@ def order_by(self, *args, remapping=None, patches=None, **kwargs): GribField(u,850,20180801,1200,0,0) """ kwargs = normalize_order_by(*args, **kwargs) - kwargs = self._normalize_kwargs_names(**kwargs) remapping = build_remapping(remapping, patches) diff --git a/src/earthkit/data/core/metadata.py b/src/earthkit/data/core/metadata.py index ef8b7589..851b5ad4 100644 --- a/src/earthkit/data/core/metadata.py +++ b/src/earthkit/data/core/metadata.py @@ -9,10 +9,16 @@ from abc import ABCMeta from abc import abstractmethod +from functools import lru_cache from earthkit.data.core.constants import DATETIME from earthkit.data.core.constants import GRIDSPEC +try: + from functools import cache as memoise # noqa +except ImportError: + memoise = lru_cache + class Metadata(metaclass=ABCMeta): r"""Base class to represent metadata. @@ -22,6 +28,14 @@ class Metadata(metaclass=ABCMeta): Implemented in subclasses: :obj:`RawMetadata`, :obj:`GribMetadata`. + Parameters + ---------- + extra: dict, None + Extra key/value pairs to be added on top of the underlying metadata. Default is None. + cache: bool + Enable caching of all the calls to :meth:`get`. Default is False. The cache + is attached to the instance. 
+ Examples -------- - :ref:`/examples/metadata.ipynb` @@ -37,6 +51,12 @@ class Metadata(metaclass=ABCMeta): extra = None + def __init__(self, extra=None, cache=False): + if extra is not None: + self.extra = extra + if cache: + self.get = memoise(self.get) + def __iter__(self): """Return an iterator over the metadata keys.""" return iter(self.keys()) @@ -146,6 +166,8 @@ def _items(self): def get(self, key, default=None, *, astype=None, raise_on_missing=False): r"""Return the value for ``key``. + When the instance is created with ``cache=True`` all the result is cached. + Parameters ---------- key: str @@ -175,13 +197,12 @@ def get(self, key, default=None, *, astype=None, raise_on_missing=False): """ if self._is_extra_key(key): - return self._get_extra_key(key, default=default, astype=astype) - if self._is_custom_key(key): - return self._get_custom_key( - key, default=default, astype=astype, raise_on_missing=raise_on_missing - ) + v = self._get_extra_key(key, default=default, astype=astype) + elif self._is_custom_key(key): + v = self._get_custom_key(key, default=default, astype=astype, raise_on_missing=raise_on_missing) else: - return self._get(key, default=default, astype=astype, raise_on_missing=raise_on_missing) + v = self._get(key, default=default, astype=astype, raise_on_missing=raise_on_missing) + return v @abstractmethod def _get(self, key, astype=None, default=None, raise_on_missing=False): @@ -192,7 +213,6 @@ def _is_extra_key(self, key): def _get_extra_key(self, key, default=None, astype=None, **kwargs): v = self.extra.get(key, default) - if astype is not None and v is not None: try: return astype(v) @@ -275,6 +295,7 @@ def datetime(self): dict of datatime.datetime Dict with items "base_time" and "valid_time". 
+ >>> import earthkit.data >>> ds = earthkit.data.from_source("file", "tests/data/t_time_series.grib") >>> ds[4].datetime() @@ -372,6 +393,7 @@ class RawMetadata(Metadata): def __init__(self, *args, **kwargs): self._d = dict(*args, **kwargs) + super().__init__() def override(self, *args, **kwargs): d = dict(**self._d) diff --git a/src/earthkit/data/core/order.py b/src/earthkit/data/core/order.py index dbfaf378..5d8f4ae1 100644 --- a/src/earthkit/data/core/order.py +++ b/src/earthkit/data/core/order.py @@ -72,7 +72,7 @@ def _build_remapping(mapping): if mapping is None: return Remapping({}) - if not isinstance(mapping, Remapping) and isinstance(mapping, dict): + if not isinstance(mapping, (Remapping, Patch)) and isinstance(mapping, dict): return Remapping(mapping) return mapping diff --git a/src/earthkit/data/core/settings.py b/src/earthkit/data/core/settings.py index 43073f49..0567fb2b 100644 --- a/src/earthkit/data/core/settings.py +++ b/src/earthkit/data/core/settings.py @@ -212,6 +212,31 @@ def validate(self, name, value): {validator}""", validator=IntervalValidator(Interval(8, 4096)), ), + "grib-field-policy": _( + "persistent", + """GRIB field management policy for fieldlists with data on disk. {validator} + See :doc:`/guide/misc/grib_memory` for more information.""", + validator=ListValidator(["persistent", "temporary"]), + ), + "grib-handle-policy": _( + "cache", + """GRIB handle management policy for fieldlists with data on disk. {validator} + See :doc:`/guide/misc/grib_memory` for more information.""", + validator=ListValidator(["cache", "persistent", "temporary"]), + ), + "grib-handle-cache-size": _( + 1, + """Maximum number of GRIB handles cached in memory per fieldlist with data on disk. + Used when ``grib-handle-policy`` is ``cache``. 
+ See :doc:`/guide/misc/grib_memory` for more information.""", + none_ok=True, + ), + "use-grib-metadata-cache": _( + True, + """Use in-memory cache kept in each field for GRIB metadata access in + fieldlists with data on disk. + See :doc:`/guide/misc/grib_memory` for more information.""", + ), } diff --git a/src/earthkit/data/readers/grib/codes.py b/src/earthkit/data/readers/grib/codes.py index b10163dd..c61a6e71 100644 --- a/src/earthkit/data/readers/grib/codes.py +++ b/src/earthkit/data/readers/grib/codes.py @@ -9,12 +9,14 @@ import logging import os +from collections import defaultdict +from functools import cached_property import eccodes import numpy as np from earthkit.data.core.fieldlist import Field -from earthkit.data.readers.grib.metadata import GribMetadata +from earthkit.data.readers.grib.metadata import GribFieldMetadata from earthkit.data.utils.message import CodesHandle from earthkit.data.utils.message import CodesMessagePositionIndex from earthkit.data.utils.message import CodesReader @@ -229,7 +231,7 @@ class GribCodesReader(CodesReader): class GribField(Field): - r"""Represents a GRIB message in a GRIB file. + r"""Represent a GRIB message in a GRIB file. 
Parameters ---------- @@ -241,21 +243,33 @@ class GribField(Field): Size of the message (in bytes) """ - def __init__(self, path, offset, length, backend): + _handle = None + + def __init__(self, path, offset, length, backend, manager=None): super().__init__(backend) self.path = path self._offset = offset self._length = length - self._handle = None + self._manager = manager @property def handle(self): - r""":class:`CodesHandle`: Gets an object providing access to the low level GRIB message structure.""" + r""":class:`CodesHandle`: Get an object providing access to the low level GRIB message structure.""" + if self._manager is not None: + handle = self._manager.handle(self, self._create_handle) + if handle is None: + raise RuntimeError(f"Could not get a handle for offset={self.offset} in {self.path}") + return handle + + # create a new handle and keep it in the field if self._handle is None: assert self._offset is not None - self._handle = GribCodesReader.from_cache(self.path).at_offset(self._offset) + self._handle = self._create_handle() return self._handle + def _create_handle(self): + return GribCodesReader.from_cache(self.path).at_offset(self.offset) + def _values(self, dtype=None): return self.handle.get_values(dtype=dtype) @@ -266,8 +280,12 @@ def offset(self): self._offset = int(self.handle.get("offset")) return self._offset - def _make_metadata(self): - return GribMetadata(self.handle) + @cached_property + def _metadata(self): + cache = False + if self._manager is not None: + cache = self._manager.use_grib_metadata_cache + return GribFieldMetadata(self, cache=cache) def __repr__(self): return "GribField(%s,%s,%s,%s,%s,%s)" % ( @@ -279,14 +297,6 @@ def __repr__(self): self._metadata.get("number", None), ) - # def _get(self, name): - # """Private, for testing only""" - # # paramId is renamed as param to get rid of the - # # additional '.128' (in earthkit/data/scripts/grib.py) - # if name == "param": - # name = "paramId" - # return self.handle.get(name) - def 
write(self, f, bits_per_value=None): r"""Writes the message to a file object. @@ -315,3 +325,14 @@ def message(self): bytes """ return self.handle.get_buffer() + + def _diag(self): + r = defaultdict(int) + try: + md_cache = self._metadata.get.cache_info() + r["metadata_cache_hits"] += md_cache.hits + r["metadata_cache_misses"] += md_cache.misses + r["metadata_cache_size"] += md_cache.currsize + except Exception: + pass + return r diff --git a/src/earthkit/data/readers/grib/file.py b/src/earthkit/data/readers/grib/file.py index 9b2bc466..430c92d7 100644 --- a/src/earthkit/data/readers/grib/file.py +++ b/src/earthkit/data/readers/grib/file.py @@ -19,10 +19,23 @@ class GRIBReader(GribFieldListInOneFile, Reader): appendable = True # GRIB messages can be added to the same file def __init__(self, source, path, parts=None): - array_backend = source._kwargs.get("array_backend", None) + + _kwargs = {} + for k in [ + "array_backend", + "grib_field_policy", + "grib_handle_policy", + "grib_handle_cache_size", + "use_grib_metadata_cache", + ]: + _kwargs[k] = source._kwargs.get(k, None) + + for k in source._kwargs: + if "-" in k: + raise KeyError(f"Invalid option {k} in GRIBReader. 
Option names must not contain '-'.") Reader.__init__(self, source, path) - GribFieldListInOneFile.__init__(self, path, parts=parts, array_backend=array_backend) + GribFieldListInOneFile.__init__(self, path, parts=parts, **_kwargs) def __repr__(self): return "GRIBReader(%s)" % (self.path,) diff --git a/src/earthkit/data/readers/grib/index/__init__.py b/src/earthkit/data/readers/grib/index/__init__.py index b233738b..0ffaa85d 100644 --- a/src/earthkit/data/readers/grib/index/__init__.py +++ b/src/earthkit/data/readers/grib/index/__init__.py @@ -11,6 +11,7 @@ import math import os from abc import abstractmethod +from collections import defaultdict from earthkit.data.core.fieldlist import FieldList from earthkit.data.core.index import MaskIndex @@ -216,15 +217,170 @@ def __init__(self, *args, **kwargs): FieldList._init_from_multi(self, self) +class GribResourceManager: + def __init__( + self, owner, grib_field_policy, grib_handle_policy, grib_handle_cache_size, use_grib_metadata_cache + ): + from earthkit.data.core.settings import SETTINGS + + def _get(v, name): + return v if v is not None else SETTINGS.get(name) + + self.grib_field_policy = _get(grib_field_policy, "grib-field-policy") + self.grib_handle_policy = _get(grib_handle_policy, "grib-handle-policy") + self.grib_handle_cache_size = _get(grib_handle_cache_size, "grib-handle-cache-size") + self.use_grib_metadata_cache = _get(use_grib_metadata_cache, "use-grib-metadata-cache") + + # fields + self.field_cache = None + if self.grib_field_policy == "persistent": + from lru import LRU + + # TODO: the number of fields might only be available only later (e.g. fieldlists with + # an SQL index). Consider making _field_cache a cached property. 
+ n = len(owner) + if n > 0: + self.field_cache = LRU(n) + + # handles + self.handle_cache = None + if self.grib_handle_policy == "cache": + if self.grib_handle_cache_size > 0: + from lru import LRU + + self.handle_cache = LRU(self.grib_handle_cache_size) + else: + raise ValueError( + 'grib_handle_cache_size must be greater than 0 when grib_handle_policy="cache"' + ) + + self.handle_create_count = 0 + self.field_create_count = 0 + + # check consistency + if self.field_cache is not None: + assert self.grib_field_policy == "persistent" + else: + assert self.grib_field_policy == "temporary" or len(owner) == 0 + + if self.handle_cache is not None: + assert self.grib_handle_policy == "cache" + else: + assert self.grib_handle_policy in ["persistent", "temporary"] + + def field(self, n, create): + if self.grib_field_policy == "persistent": + if n in self.field_cache: + return self.field_cache[n] + else: + field = create(n) + self._field_created() + self.field_cache[n] = field + return field + else: + self._field_created() + return create(n) + + def handle(self, field, create): + if self.grib_handle_policy == "cache": + key = (field.path, field._offset) + if key in self.handle_cache: + return self.handle_cache[key] + else: + handle = create() + self._handle_created() + self.handle_cache[key] = handle + return handle + elif self.grib_handle_policy == "persistent": + if field._handle is None: + field._handle = create() + self._handle_created() + return field._handle + elif self.grib_handle_policy == "temporary": + self._handle_created() + return create() + + def _field_created(self): + self.field_create_count += 1 + + def _handle_created(self): + self.handle_create_count += 1 + + def diag(self): + r = defaultdict(int) + r["grib_field_policy"] = self.grib_field_policy + r["grib_handle_policy"] = self.grib_handle_policy + r["grib_handle_cache_size"] = self.grib_handle_cache_size + + if self.field_cache is not None: + r["field_cache_size"] = len(self.field_cache) + + r["field_create_count"] = self.field_create_count + + if 
self.handle_cache is not None: + r["handle_cache_size"] = len(self.handle_cache) + + r["handle_create_count"] = self.handle_create_count + + if self.field_cache is not None: + for f in self.field_cache.values(): + if f._handle is not None: + r["current_handle_count"] += 1 + + try: + md_cache = f._diag() + for k in ["metadata_cache_hits", "metadata_cache_misses", "metadata_cache_size"]: + r[k] += md_cache[k] + except Exception: + pass + + return r + + class GribFieldListInFiles(GribFieldList): + def __init__( + self, + *args, + grib_field_policy=None, + grib_handle_policy=None, + grib_handle_cache_size=None, + use_grib_metadata_cache=None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self._manager = GribResourceManager( + self, grib_field_policy, grib_handle_policy, grib_handle_cache_size, use_grib_metadata_cache + ) + + def _create_field(self, n): + part = self.part(n) + field = GribField( + part.path, + part.offset, + part.length, + self.array_backend, + manager=self._manager, + ) + if field is None: + raise RuntimeError(f"Could not get a handle for part={part}") + return field + def _getitem(self, n): + # TODO: check if we need a mutex here if isinstance(n, int): - part = self.part(n if n >= 0 else len(self) + n) - return GribField(part.path, part.offset, part.length, self.array_backend) + if n < 0: + n += len(self) + if n >= len(self): + raise IndexError(f"Index {n} out of range") + + return self._manager.field(n, self._create_field) def __len__(self): return self.number_of_parts() + def _diag(self): + return self._manager.diag() + @abstractmethod def part(self, n): self._not_implemented() diff --git a/src/earthkit/data/readers/grib/index/sql.py b/src/earthkit/data/readers/grib/index/sql.py index 88327279..d86c81fd 100644 --- a/src/earthkit/data/readers/grib/index/sql.py +++ b/src/earthkit/data/readers/grib/index/sql.py @@ -68,7 +68,6 @@ def unique_values(self, *coords, remapping=None, progress_bar=None): # print("Not using remapping here") coords = 
{k: None for k in coords} - coords = self._normalize_kwargs_names(**coords) coords = list(coords.keys()) # print("coords:", coords) values = self.db.unique_values(*coords, remapping=remapping).values() @@ -84,7 +83,6 @@ def filter(self, filter): def sel(self, *args, remapping=None, **kwargs): kwargs = normalize_selection(*args, **kwargs) - kwargs = self._normalize_kwargs_names(**kwargs) if DATETIME in kwargs and kwargs[DATETIME] is not None: kwargs = _normalize_grib_kwargs_values(**kwargs) @@ -94,7 +92,6 @@ def sel(self, *args, remapping=None, **kwargs): def order_by(self, *args, remapping=None, **kwargs): kwargs = normalize_order_by(*args, **kwargs) - kwargs = self._normalize_kwargs_names(**kwargs) out = self diff --git a/src/earthkit/data/readers/grib/memory.py b/src/earthkit/data/readers/grib/memory.py index 8b5d1dd2..38588c90 100644 --- a/src/earthkit/data/readers/grib/memory.py +++ b/src/earthkit/data/readers/grib/memory.py @@ -8,6 +8,7 @@ # import logging +from functools import cached_property import eccodes @@ -15,6 +16,7 @@ from earthkit.data.readers.grib.codes import GribCodesHandle from earthkit.data.readers.grib.codes import GribField from earthkit.data.readers.grib.index import GribFieldList +from earthkit.data.readers.grib.metadata import GribFieldMetadata from earthkit.data.utils.array import ensure_backend LOG = logging.getLogger(__name__) @@ -129,6 +131,10 @@ def handle(self): def offset(self): return None + @cached_property + def _metadata(self): + return GribFieldMetadata(self) + @staticmethod def to_fieldlist(fields): return GribFieldListInMemory.from_fields(fields) diff --git a/src/earthkit/data/readers/grib/metadata.py b/src/earthkit/data/readers/grib/metadata.py index 03028c53..500d8c01 100644 --- a/src/earthkit/data/readers/grib/metadata.py +++ b/src/earthkit/data/readers/grib/metadata.py @@ -9,6 +9,7 @@ import datetime import warnings +from abc import abstractmethod from functools import cached_property from earthkit.data.core.geography 
import Geography @@ -239,26 +240,11 @@ def longitudes_unrotated(self, **kwargs): class GribMetadata(Metadata): - """Represent the metadata of a GRIB field. - - Parameters - ---------- - handle: :obj:`GribCodesHandle` - Object representing the ecCodes GRIB handle of the field. - - :obj:`GribMetadata` is created internally by a :obj:`GribField` and we can use - the field's :meth:`metadata` method to access it. - - >>> ds = earthkit.data.from_source("file", "docs/examples/test4.grib") - >>> md = ds[0].metadata() - >>> md["shortName"] - 't' - >>> md.get("shortName") - 't' - >>> md.get("nonExistentKey") - >>> md.get("nonExistentKey", 12) - 12 + """GRIB metadata. + :obj:`GribMetadata` is an abstract class and should not be instantiated directly. + There are two concrete implementations: :class:`GribFieldMetadata` and + :class:`StandAloneGribMetadata`. """ LS_KEYS = [ @@ -306,13 +292,14 @@ class GribMetadata(Metadata): __handle_type = None - def __init__(self, handle, extra=None): - if not isinstance(handle, self._handle_type()): - raise TypeError(f"GribMetadata: expected handle type {self._handle_type()}, got {type(handle)}") - self._handle = handle + @abstractmethod + def _handle(self): + pass + + def __init__(self, shrunk=False, **kwargs): self._geo = None - if extra is not None: - self.extra = extra + self._shrunk = shrunk + super().__init__(**kwargs) @staticmethod def _handle_type(): @@ -360,7 +347,7 @@ def _key_name(key): def _is_custom_key(self, key): return key in self.CUSTOM_KEYS - def override(self, *args, **kwargs): + def override(self, *args, headers_only_clone=True, **kwargs): d = dict(*args, **kwargs) new_value_size = None @@ -373,17 +360,27 @@ def override(self, *args, **kwargs): md, new_value_size = GridSpecConverter.to_metadata(gridspec, edition=edition) d.update(md) - handle = self._handle.clone(headers_only=True) - # whether headers_only=True works depends on the eccCodes version and the - # message properties. 
We check it by comparing the message lengths. - shrunk = handle.get_long("totalLength") < self._handle.get_long("totalLength") - - # some keys, needed later, are not copied into the clone when - # headers_only=True. We store them as extra keys. - if shrunk: - extra = {"bitsPerValue": self._handle.get("bitsPerValue", default=0)} + handle = self._handle.clone(headers_only=headers_only_clone) + + if self._shrunk: + shrunk = True + extra = self.extra + elif headers_only_clone: + # whether headers_only=True works depends on the eccCodes version and the + # message properties. We check it by comparing the message lengths. + shrunk = handle.get_long("totalLength") < self._handle.get_long("totalLength") + + # some keys, needed later, are not copied into the clone when + # headers_only=True. We store them as extra keys. + if shrunk: + extra = {"bitsPerValue": self._handle.get("bitsPerValue", default=0)} + else: + extra = None + else: + shrunk = False - handle.set_multiple(d) + if d: + handle.set_multiple(d) # we need to set the values to the new size otherwise the clone generated # with headers_only=True will be inconsistent @@ -393,7 +390,7 @@ def override(self, *args, **kwargs): vals = np.zeros(new_value_size) handle.set_values(vals) - return GribMetadata(handle, extra=extra) + return StandAloneGribMetadata(handle, extra=extra, shrunk=shrunk) def as_namespace(self, namespace=None): r"""Return all the keys/values from a namespace. @@ -491,13 +488,75 @@ def dump(self, namespace=all, **kwargs): return format_namespace_dump(r, selected="parameter", details=self.__class__.__name__, **kwargs) + +class GribFieldMetadata(GribMetadata): + """Represent the metadata of a GRIB field. + + :obj:`GribFieldMetadata` is created internally by a :obj:`GribField`. It does not + own the ecCodes GRIB handle but can access it through the :obj:`GribField`. + Calling :meth:`metadata` without arguments on a :obj:`GribField` returns this object. 
+ """ + + def __init__(self, field, **kwargs): + self._field = field + assert field is not None + super().__init__(**kwargs) + + @property + def _handle(self): + return self._field.handle + + def _hide_internal_keys(self): + r = self.override() + return RestrictedGribMetadata(r) + + +class StandAloneGribMetadata(GribMetadata): + """Represent standalone GRIB metadata owning an ecCodes GRIB handle. + + :class:`StandAloneGribMetadata` possesses its own ecCodes handle. Calling + :meth:`override` on :obj:`GribMetadata` always returns a + :class:`StandAloneGribMetadata` object. + + >>> ds = earthkit.data.from_source("file", "docs/examples/test4.grib") + >>> md = ds[0].metadata() + >>> md["shortName"] + 't' + >>> md.get("shortName") + 't' + >>> md.get("nonExistentKey") + >>> md.get("nonExistentKey", 12) + 12 + + Examples + -------- + :ref:`/examples/grib_metadata_object.ipynb` + + """ + + def __init__(self, handle, **kwargs): + if not isinstance(handle, self._handle_type()): + raise TypeError(f"GribMetadata: expected handle type {self._handle_type()}, got {type(handle)}") + self.__handle = handle + self._kwargs = dict(**kwargs) + super().__init__(**kwargs) + + @property + def _handle(self): + return self.__handle + def _hide_internal_keys(self): return RestrictedGribMetadata(self) # TODO: this is a temporary solution -class RestrictedGribMetadata(GribMetadata): - """Hide internal keys and namespaces in GRIB metadata""" +class RestrictedGribMetadata(StandAloneGribMetadata): + """Hide internal keys and namespaces in GRIB metadata. 
+ + Examples + -------- + :ref:`/examples/grib_metadata_object.ipynb` + """ EKD_NAMESPACE = "grib" @@ -531,7 +590,8 @@ class RestrictedGribMetadata(GribMetadata): INTERNAL_NAMESPACES = ["statistics"] def __init__(self, md): - super().__init__(md._handle, extra=md.extra) + assert isinstance(md, StandAloneGribMetadata) + super().__init__(md._handle, shrunk=md._shrunk, extra=md.extra) @cached_method def _len(self): diff --git a/src/earthkit/data/readers/netcdf/field.py b/src/earthkit/data/readers/netcdf/field.py index 5cdad0a8..f04176db 100644 --- a/src/earthkit/data/readers/netcdf/field.py +++ b/src/earthkit/data/readers/netcdf/field.py @@ -144,6 +144,8 @@ def __init__(self, field): super().__init__(d) def override(self, *args, **kwargs): + if not args and not kwargs: + return self return None @property @@ -216,7 +218,8 @@ def __repr__(self): + ")" ) - def _make_metadata(self): + @cached_property + def _metadata(self): return XArrayMetadata(self) def to_xarray(self): @@ -272,5 +275,6 @@ class NetCDFMetadata(XArrayMetadata): class NetCDFField(XArrayField): - def _make_metadata(self): + @cached_property + def _metadata(self): return NetCDFMetadata(self) diff --git a/src/earthkit/data/sources/array_list.py b/src/earthkit/data/sources/array_list.py index 2c533659..b56bc82d 100644 --- a/src/earthkit/data/sources/array_list.py +++ b/src/earthkit/data/sources/array_list.py @@ -38,9 +38,6 @@ def __init__(self, array, metadata, array_backend): super().__init__(array_backend, raw_values_backend=array_backend, metadata=metadata) self._array = array - def _make_metadata(self): - pass - def _values(self, dtype=None): """Native array type""" if dtype is None: diff --git a/src/earthkit/data/sources/forcings.py b/src/earthkit/data/sources/forcings.py index c48bf6d5..b351c4ff 100644 --- a/src/earthkit/data/sources/forcings.py +++ b/src/earthkit/data/sources/forcings.py @@ -235,9 +235,6 @@ def __init__(self, maker, date, param, proc, number=None, array_backend=None): 
metadata=ForcingMetadata(d, self.maker.field.metadata().geography), ) - def _make_metadata(self): - pass - def _values(self, dtype=None): values = self.proc(self.date) if dtype is not None: diff --git a/src/earthkit/data/sources/list_of_dicts.py b/src/earthkit/data/sources/list_of_dicts.py index 6b1d50ea..2e467c00 100644 --- a/src/earthkit/data/sources/list_of_dicts.py +++ b/src/earthkit/data/sources/list_of_dicts.py @@ -179,9 +179,6 @@ def _values(self, dtype=None): else: return v.astype(dtype) - def _make_metadata(self): - pass - class GribFromDicts(GribFieldList): def __init__(self, list_of_dicts, *args, **kwargs): diff --git a/src/earthkit/data/utils/message.py b/src/earthkit/data/utils/message.py index 42f49ce5..735be741 100644 --- a/src/earthkit/data/utils/message.py +++ b/src/earthkit/data/utils/message.py @@ -55,6 +55,9 @@ def check_clone_kwargs(self, **kwargs): kwargs.pop("headers_only", None) return kwargs + def has_Ni_Nj_in_geo_namespace(self): + return self._version >= (2, 37, 0) + @property def versions(self): return f"ecCodes: {self._version} eccodes-python: {self._py_version}" diff --git a/tests/array_fieldlist/array_fl_fixtures.py b/tests/array_fieldlist/array_fl_fixtures.py index f225f14f..4a66264d 100644 --- a/tests/array_fieldlist/array_fl_fixtures.py +++ b/tests/array_fieldlist/array_fl_fixtures.py @@ -31,6 +31,7 @@ def load_array_fl(num, array_backend=None): ds = [] for x in ds_in: + print("len", len(x)) ds.append(FieldList.from_array(x.values, [m.override(edition=1) for m in x.metadata()])) return (*ds, md) diff --git a/tests/array_fieldlist/test_numpy_fs_metadata.py b/tests/array_fieldlist/test_numpy_fs_metadata.py index 5c5957a3..64abd89c 100644 --- a/tests/array_fieldlist/test_numpy_fs_metadata.py +++ b/tests/array_fieldlist/test_numpy_fs_metadata.py @@ -23,27 +23,27 @@ # See grib/test_grib_metadata.py -def test_array_fl_values_metadata(): +def test_array_fl_values_metadata_basic(): ds, _ = load_array_fl(1) # values metadata - keys = [ - 
"min", - "max", - "avg", - "ds", - "skew", - "kurt", - "isConstant", - "const", - "bitmapPresent", - "numberOfMissing", - "values", - ] - for k in keys: - assert ds[0].metadata(k, default=None) is None, k - with pytest.raises(KeyError): - ds[0].metadata(k) + # keys = [ + # "min", + # "max", + # "avg", + # "ds", + # "skew", + # "kurt", + # "isConstant", + # "const", + # "bitmapPresent", + # "numberOfMissing", + # "values", + # ] + # for k in keys: + # assert ds[0].metadata(k, default=None) is None, k + # with pytest.raises(KeyError): + # ds[0].metadata(k) # bits per value must be kept from the original GRIB data assert ds[0].metadata("bitsPerValue") == 16 diff --git a/tests/array_fieldlist/test_numpy_fs_summary.py b/tests/array_fieldlist/test_numpy_fs_summary.py index 9fcf6258..d1f5ec6a 100644 --- a/tests/array_fieldlist/test_numpy_fs_summary.py +++ b/tests/array_fieldlist/test_numpy_fs_summary.py @@ -121,6 +121,12 @@ def test_array_fl_dump(): }, ] + from earthkit.data.utils.message import ECC_FEATURES + + if not ECC_FEATURES.has_Ni_Nj_in_geo_namespace(): + ref[1]["data"].pop("Ni") + ref[1]["data"].pop("Nj") + assert len(r) == len(namespaces) assert isinstance(r, list) for d in r: diff --git a/tests/core/test_metadata.py b/tests/core/test_metadata.py index 7b342643..14584a8b 100644 --- a/tests/core/test_metadata.py +++ b/tests/core/test_metadata.py @@ -9,11 +9,14 @@ # nor does it submit to any jurisdiction. 
# +import numpy as np import pytest from earthkit.data import from_source from earthkit.data.core.metadata import RawMetadata -from earthkit.data.readers.grib.metadata import GribMetadata +from earthkit.data.readers.grib.metadata import GribFieldMetadata +from earthkit.data.readers.grib.metadata import RestrictedGribMetadata +from earthkit.data.readers.grib.metadata import StandAloneGribMetadata from earthkit.data.testing import earthkit_examples_file from earthkit.data.testing import earthkit_test_data_file @@ -104,17 +107,21 @@ def test_raw_metadata_override_with_kwarg(): def test_grib_metadata_create(): f = from_source("file", earthkit_examples_file("test.grib")) - md = f[0].metadata() - assert isinstance(md, GribMetadata) + + f0 = f[0] + md = f0.metadata() + assert isinstance(md, GribFieldMetadata) + assert md._handle is not None + assert md._handle == f0.handle # cannot create from dict with pytest.raises(TypeError): - GribMetadata({"shortName": "u", "typeOfLevel": "pl", "levelist": 1000}) + StandAloneGribMetadata({"shortName": "u", "typeOfLevel": "pl", "levelist": 1000}) # cannot create from raw metadata raw_md = RawMetadata({"shortName": "u", "typeOfLevel": "pl", "levelist": 1000}) with pytest.raises(TypeError): - GribMetadata(raw_md) + StandAloneGribMetadata(raw_md) def test_grib_metadata_get(): @@ -265,7 +272,7 @@ def test_grib_metadata_override_extra(): assert md["shortName"] == "2t" extra = {"my_custom_key": "2", "shortName": "N", "perturbationNumber": 2} - md = GribMetadata(md._handle, extra=extra) + md = StandAloneGribMetadata(md._handle, extra=extra) assert md["my_custom_key"] == "2" assert md["perturbationNumber"] == 2 @@ -290,6 +297,50 @@ def test_grib_metadata_override_extra(): break +def test_grib_metadata_override_headers_only_true(): + ds = from_source("file", earthkit_examples_file("test.grib")) + ref_size = ds[0].metadata("totalLength") + + md1 = ds[0].metadata().override(headers_only_clone=True) + assert isinstance(md1, 
StandAloneGribMetadata) + assert md1._handle is not None + assert md1._handle != ds[0]._handle + assert md1["totalLength"] - ref_size < -10 + assert md1._shrunk + + md2 = md1._hide_internal_keys() + assert isinstance(md2, RestrictedGribMetadata) + assert md2._handle is not None + assert md2._handle != ds[0]._handle + assert md2._handle == md1._handle + assert md2._shrunk + + with pytest.raises(KeyError): + md2["average"] + + +def test_grib_metadata_override_headers_only_false(): + ds = from_source("file", earthkit_examples_file("test.grib")) + ref_size = ds[0].metadata("totalLength") + + md1 = ds[0].metadata().override(headers_only_clone=False) + assert isinstance(md1, StandAloneGribMetadata) + assert md1._handle is not None + assert md1._handle != ds[0]._handle + assert np.isclose(md1["totalLength"], ref_size) + assert not md1._shrunk + + md2 = md1._hide_internal_keys() + assert isinstance(md2, RestrictedGribMetadata) + assert md2._handle is not None + assert md2._handle != ds[0]._handle + assert md2._handle == md1._handle + assert not md2._shrunk + + with pytest.raises(KeyError): + md2["average"] + + if __name__ == "__main__": from earthkit.data.testing import main diff --git a/tests/core/test_remapping.py b/tests/core/test_remapping.py new file mode 100644 index 00000000..968c8903 --- /dev/null +++ b/tests/core/test_remapping.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+# + +import pytest + +from earthkit.data.core.order import build_remapping + +data = {"type": "cf", "number": 0, "name": "t2m"} + + +def fn(key): + return data[key] + + +@pytest.mark.parametrize( + "remapping,expected_values", + [ + ({}, {"type": "cf", "number": 0, "name": "t2m"}), + (None, {"type": "cf", "number": 0, "name": "t2m"}), + ({"type": "pf"}, {"type": "pf", "number": 0, "name": "t2m"}), + ({"type": "{type}{number}"}, {"type": "cf0", "name": "t2m"}), + ({"type": "{type}_{number}"}, {"type": "cf_0", "name": "t2m"}), + ({"type": lambda: 12}, {"type": 12, "name": "t2m"}), + ], +) +def test_remapping_dict(remapping, expected_values): + r = build_remapping(remapping) + _fn = r(fn) + + for k, v in expected_values.items(): + assert _fn(k) == v + + +def test_remapping_repeated(): + remapping = {"type": "{type}{number}"} + r = build_remapping(remapping) + r = build_remapping(r) + _fn = r(fn) + + assert _fn("type") == "cf0" + + +@pytest.mark.parametrize( + "remapping,patch,expected_values", + [ + ({}, {"type": {"cf": "pf"}, "number": 12, "name": None}, {"type": "pf", "number": 12, "name": None}), + ({"type": "{type}{number}"}, {"type": {"cf0": "x"}}, {"type": "x", "number": 0, "name": "t2m"}), + ({}, {"type": lambda x: "x"}, {"type": "x", "number": 0, "name": "t2m"}), + ({}, {"type": True}, {"type": True, "number": 0, "name": "t2m"}), + (None, {"type": True}, {"type": True, "number": 0, "name": "t2m"}), + ], +) +def test_remapping_patch(remapping, patch, expected_values): + r = build_remapping(remapping, patch) + _fn = r(fn) + + for k, v in expected_values.items(): + assert _fn(k) == v + + +def test_remapping_patch_repeated(): + remapping = {"type": "{type}{number}"} + patch = {"type": {"cf0": "x"}} + r = build_remapping(remapping, patch) + r = build_remapping(r) + _fn = r(fn) + + assert _fn("type") == "x" diff --git a/tests/environment-unit-tests.yml b/tests/environment-unit-tests.yml index 5b72f25c..d3de8854 100644 --- a/tests/environment-unit-tests.yml +++ 
b/tests/environment-unit-tests.yml @@ -33,6 +33,7 @@ dependencies: - eccovjson>=0.0.5 - earthkit-geo>=0.2.0 - tqdm>=4.63.0 +- lru-dict - markdown - make - mypy diff --git a/tests/grib/test_grib_cache.py b/tests/grib/test_grib_cache.py new file mode 100644 index 00000000..fd9c255f --- /dev/null +++ b/tests/grib/test_grib_cache.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +import pytest + +from earthkit.data import from_source +from earthkit.data import settings +from earthkit.data.testing import earthkit_examples_file + + +def _check_diag(diag, ref): + for k, v in ref.items(): + assert diag[k] == v, f"{k}={diag[k]} != {v}" + + +@pytest.mark.parametrize("handle_cache_size", [1, 5]) +def test_grib_cache_basic(handle_cache_size): + + with settings.temporary( + { + "grib-field-policy": "persistent", + "grib-handle-policy": "cache", + "grib-handle-cache-size": handle_cache_size, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ref_vals = ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + diag = ds._diag() + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_cache_size": handle_cache_size, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + _check_diag(ds._diag(), ref) + + for i, f in enumerate(ds): + assert i in cache.field_cache, f"{i} not in cache" + assert id(f) == id(cache.field_cache[i]), 
f"{i} not the same object" + + _check_diag(ds._diag(), ref) + + # unique values repeated + vals = ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + assert vals == ref_vals + + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_cache_size": handle_cache_size, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 18 * 4, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + _check_diag(ds._diag(), ref) + + # order by + ds.order_by(["levelist", "valid_datetime", "paramId", "levtype"]) + diag = ds._diag() + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_cache_size": handle_cache_size, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + _check_diag(ds._diag(), ref) + + assert diag["metadata_cache_hits"] >= 18 * 4 + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0].handle == md._handle + + +def test_grib_cache_options_1(): + with settings.temporary( + { + "grib-field-policy": "persistent", + "grib-handle-policy": "temporary", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + assert cache.field_cache is not None + assert cache.handle_cache is None + + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_create_count": 18 * 5, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + + _check_diag(ds._diag(), ref) + + for i, f in enumerate(ds): + assert i in cache.field_cache, f"{i} not in cache" + assert id(f) == id(cache.field_cache[i]), f"{i} not the same object" + 
+ _check_diag(ds._diag(), ref) + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle is None + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is None + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + # key already cached + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + ref["metadata_cache_hits"] += 1 + _check_diag(ds._diag(), ref) + + # uncached key + first.metadata("level") + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 7, "metadata_cache_size": 7} + ) + + ref["handle_create_count"] += 1 + ref["metadata_cache_misses"] += 1 + ref["metadata_cache_size"] += 1 + _check_diag(ds._diag(), ref) + + assert first.handle != md._handle + + ref["handle_create_count"] += 2 + _check_diag(ds._diag(), ref) + + +def test_grib_cache_options_2(): + with settings.temporary( + { + "grib-field-policy": "persistent", + "grib-handle-policy": "persistent", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + assert cache.field_cache is not None + assert cache.handle_cache is None + + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_create_count": 18, + "current_handle_count": 18, + "metadata_cache_hits": 0, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + + _check_diag(ds._diag(), ref) + + for i, f in enumerate(ds): + assert i in cache.field_cache, f"{i} not in cache" + assert id(f) == id(cache.field_cache[i]), f"{i} not 
the same object" + + _check_diag(ds._diag(), ref) + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle is not None + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is not None + assert first._handle == md._handle + assert first.handle == first._handle + + _check_diag(ds._diag(), ref) + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + # key already cached + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + ref["metadata_cache_hits"] += 1 + _check_diag(ds._diag(), ref) + + # uncached key + first.metadata("level") + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 7, "metadata_cache_size": 7} + ) + + ref["metadata_cache_misses"] += 1 + ref["metadata_cache_size"] += 1 + _check_diag(ds._diag(), ref) + + assert first.handle == md._handle + + _check_diag(ds._diag(), ref) + + +def test_grib_cache_options_3(): + with settings.temporary( + { + "grib-field-policy": "persistent", + "grib-handle-policy": "cache", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + assert cache.field_cache is not None + assert cache.handle_cache is not None + + ref = { + "field_cache_size": 18, + "field_create_count": 18, + "handle_cache_size": 1, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 18 * 6, + "metadata_cache_size": 18 * 6, + } + + _check_diag(ds._diag(), ref) + + for i, f in enumerate(ds): + assert i in 
cache.field_cache, f"{i} not in cache" + assert id(f) == id(cache.field_cache[i]), f"{i} not the same object" + + _check_diag(ds._diag(), ref) + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle is None + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is None + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + # key already cached + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 6, "metadata_cache_size": 6} + ) + + ref["metadata_cache_hits"] += 1 + _check_diag(ds._diag(), ref) + + # uncached key + first.metadata("level") + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 7, "metadata_cache_size": 7} + ) + + ref["handle_create_count"] += 1 + ref["metadata_cache_misses"] += 1 + ref["metadata_cache_size"] += 1 + _check_diag(ds._diag(), ref) + + assert first.handle == md._handle + + _check_diag(ds._diag(), ref) + + +def test_grib_cache_options_4(): + with settings.temporary( + { + "grib-field-policy": "temporary", + "grib-handle-policy": "temporary", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + ref = { + "field_cache_size": 0, + "field_create_count": 18, + "handle_cache_size": 0, + "handle_create_count": 18 * 5, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 0, + "metadata_cache_size": 0, + } + + _check_diag(ds._diag(), ref) + + assert cache.field_cache is None + assert cache.handle_cache is None + + # metadata object is not decoupled from the 
field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle != md._handle + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is None + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + # repeat with indexed field + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + +def test_grib_cache_options_5(): + with settings.temporary( + { + "grib-field-policy": "temporary", + "grib-handle-policy": "persistent", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # 
unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + ref = { + "field_cache_size": 0, + "field_create_count": 18, + "handle_cache_size": 0, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 0, + "metadata_cache_size": 0, + } + + _check_diag(ds._diag(), ref) + + assert cache.field_cache is None + assert cache.handle_cache is None + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle != md._handle + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is None + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + assert first._handle is not None + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + assert first._handle is not None + + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + # repeat with indexed field + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( 
+ ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + +def test_grib_cache_options_6(): + with settings.temporary( + { + "grib-field-policy": "temporary", + "grib-handle-policy": "cache", + "grib-handle-cache-size": 1, + "use-grib-metadata-cache": True, + } + ): + ds = from_source("file", earthkit_examples_file("tuv_pl.grib")) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + ref = { + "field_cache_size": 0, + "field_create_count": 18, + "handle_cache_size": 1, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 0, + "metadata_cache_size": 0, + } + + _check_diag(ds._diag(), ref) + + assert cache.field_cache is None + assert cache.handle_cache is not None + + # metadata object is not decoupled from the field object + md = ds[0].metadata() + assert hasattr(md, "_field") + assert ds[0]._handle != md._handle + ref["field_create_count"] += 2 + ref["handle_create_count"] += 1 + _check_diag(ds._diag(), ref) + + # keep a reference to the field + first = ds[0] + md = first.metadata() + assert md._field == first + assert first._handle is None + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + first.metadata("levelist", default=None) + _check_diag( + first._diag(), {"metadata_cache_hits": 1, "metadata_cache_misses": 1, "metadata_cache_size": 1} + ) + + _check_diag(ds._diag(), ref) + + # repeat with indexed field + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, 
"metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + + ref["field_create_count"] += 1 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + _check_diag(ds._diag(), ref) + + ds[0].metadata("levelist", default=None) + _check_diag( + ds[0]._diag(), {"metadata_cache_hits": 0, "metadata_cache_misses": 0, "metadata_cache_size": 0} + ) + ref["field_create_count"] += 2 + _check_diag(ds._diag(), ref) + + +def test_grib_cache_use_kwargs_1(): + _kwargs = { + "grib_field_policy": "temporary", + "grib_handle_policy": "persistent", + "grib_handle_cache_size": 1, + "use_grib_metadata_cache": True, + } + + ds = from_source("file", earthkit_examples_file("tuv_pl.grib"), **_kwargs) + assert len(ds) == 18 + + cache = ds._manager + assert cache + + # unique values + ds.unique_values("paramId", "levelist", "levtype", "valid_datetime") + + ref = { + "field_cache_size": 0, + "field_create_count": 18, + "handle_cache_size": 0, + "handle_create_count": 18, + "current_handle_count": 0, + "metadata_cache_hits": 0, + "metadata_cache_misses": 0, + "metadata_cache_size": 0, + } + + _check_diag(ds._diag(), ref) + + +def test_grib_cache_use_kwargs_2(): + _kwargs = { + "grib-field-policy": "temporary", + "grib_handle_policy": "persistent", + "grib_handle_cache_size": 1, + "use_grib_metadata_cache": True, + } + + with pytest.raises(KeyError): + from_source("file", earthkit_examples_file("tuv_pl.grib"), **_kwargs) diff --git a/tests/grib/test_grib_inidces.py b/tests/grib/test_grib_inidces.py index ffbb8ad8..37d722a2 100644 --- a/tests/grib/test_grib_inidces.py +++ b/tests/grib/test_grib_inidces.py @@ -27,7 +27,7 @@ def test_grib_indices_base(fl_type, array_backend): ds = load_grib_data("tuv_pl.grib", fl_type, array_backend) - ref = { + ref_full = { "class": ["od"], "stream": ["oper"], "levtype": ["pl"], @@ -42,7 +42,7 @@ 
def test_grib_indices_base(fl_type, array_backend): } r = ds.indices() - assert r == ref + assert r == ref_full ref = { "levelist": [300, 400, 500, 700, 850, 1000], @@ -55,6 +55,15 @@ def test_grib_indices_base(fl_type, array_backend): r = ds.index("param") assert r == ref + ref = [300, 400, 500, 700, 850, 1000] + ref_full["level"] = ref + + r = ds.index("level") + assert r == ref + + r = ds.indices() + assert r == ref_full + @pytest.mark.parametrize("fl_type", FL_TYPES) @pytest.mark.parametrize("array_backend", ARRAY_BACKENDS) @@ -155,7 +164,7 @@ def test_grib_indices_multi(fl_type, array_backend): @pytest.mark.parametrize("fl_type", FL_TYPES) @pytest.mark.parametrize("array_backend", ARRAY_BACKENDS) -def test_grib_indices_multi_Del(fl_type, array_backend): +def test_grib_indices_multi_sel(fl_type, array_backend): f1 = load_grib_data("tuv_pl.grib", fl_type, array_backend) f2 = load_grib_data("ml_data.grib", fl_type, array_backend, folder="data") ds = f1 + f2 diff --git a/tests/grib/test_grib_summary.py b/tests/grib/test_grib_summary.py index 2dc87cf8..a75f06b2 100644 --- a/tests/grib/test_grib_summary.py +++ b/tests/grib/test_grib_summary.py @@ -475,6 +475,12 @@ def test_grib_dump(fl_type, array_backend): }, ] + from earthkit.data.utils.message import ECC_FEATURES + + if not ECC_FEATURES.has_Ni_Nj_in_geo_namespace(): + ref[1]["data"].pop("Ni") + ref[1]["data"].pop("Nj") + assert len(r) == len(namespaces) assert isinstance(r, list) for d in r: diff --git a/tests/grib/test_grib_values.py b/tests/grib/test_grib_values.py index 716e77d3..14faac06 100644 --- a/tests/grib/test_grib_values.py +++ b/tests/grib/test_grib_values.py @@ -249,6 +249,123 @@ def test_grib_to_numpy_18_dtype(fl_type, array_backend, dtype): assert v.dtype == dtype +@pytest.mark.parametrize("fl_type", FL_TYPES) +@pytest.mark.parametrize("array_backend", ARRAY_BACKENDS) +def test_grib_to_numpy_1_index(fl_type, array_backend): + ds = load_grib_data("test_single.grib", fl_type, array_backend, 
folder="data") + + eps = 1e-5 + + v = ds[0].to_numpy(flatten=True, index=[0, -1]) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (2,) + assert np.allclose(v, [260.43560791015625, 227.18560791015625]) + + v = ds[0].to_numpy(flatten=True, index=slice(None, None)) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + check_array( + v, + (84,), + first=260.43560791015625, + last=227.18560791015625, + meanv=274.36566743396577, + eps=eps, + ) + + v = ds[0].to_numpy(index=(slice(None, 2), slice(None, 3))) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (2, 3) + assert np.allclose( + v, + [ + [260.43560791, 260.43560791, 260.43560791], + [280.81060791, 277.06060791, 284.43560791], + ], + ) + + +@pytest.mark.parametrize("fl_type", FL_TYPES) +@pytest.mark.parametrize("array_backend", ARRAY_BACKENDS) +def test_grib_to_numpy_18_index(fl_type, array_backend): + ds = load_grib_data("tuv_pl.grib", fl_type, array_backend) + + eps = 1e-5 + + v = ds.to_numpy(flatten=True, index=[0, -1]) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18, 2) + vf0 = v[0].flatten() + assert np.allclose(vf0, [272.5642, 240.56417846679688]) + vf15 = v[15].flatten() + assert np.allclose(vf15, [226.6531524658203, 206.6531524658203]) + + v = ds.to_numpy(flatten=True, index=slice(None, 2)) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18, 2) + vf0 = v[0].flatten() + assert np.allclose(vf0, [272.56417847, 272.56417847]) + vf15 = v[15].flatten() + assert np.allclose(vf15, [226.65315247, 226.65315247]) + + v = ds.to_numpy(flatten=True, index=slice(None, None)) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18, 84) + vf0 = v[0].flatten() + check_array( + vf0, + (84,), + first=272.5642, + last=240.56417846679688, + meanv=279.70703560965404, + eps=eps, + ) + + vf15 = v[15].flatten() + check_array( + vf15, 
+ (84,), + first=226.6531524658203, + last=206.6531524658203, + meanv=227.84362865629652, + eps=eps, + ) + + v = ds.to_numpy(index=(slice(None, 2), slice(None, 3))) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18, 2, 3) + vf0 = v[0].flatten() + assert np.allclose( + vf0, + [ + 272.56417847, + 272.56417847, + 272.56417847, + 288.56417847, + 296.56417847, + 288.56417847, + ], + ) + vf15 = v[15].flatten() + assert np.allclose( + vf15, + [ + 226.65315247, + 226.65315247, + 226.65315247, + 230.65315247, + 230.65315247, + 230.65315247, + ], + ) + + @pytest.mark.parametrize("fl_type", FL_TYPES) @pytest.mark.parametrize("array_backend", ["numpy"]) @pytest.mark.parametrize( @@ -354,6 +471,105 @@ def test_grib_fieldlist_data(fl_type, array_backend, kwarg, expected_shape, expe assert np.allclose(d[2], latlon["lon"]) +@pytest.mark.parametrize("fl_type", FL_TYPES) +@pytest.mark.parametrize("array_backend", ["numpy"]) +def test_grib_fieldlist_data_index(fl_type, array_backend): + ds = load_grib_data("tuv_pl.grib", fl_type, array_backend) + + eps = 1e-5 + + latlon = ds.to_latlon(flatten=True) + lat = latlon["lat"] + lon = latlon["lon"] + + index = [0, -1] + v = ds.data(flatten=True, index=index) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18 + 2, 2) + assert np.allclose(v[0].flatten(), lat[index]) + assert np.allclose(v[1].flatten(), lon[index]) + vf0 = v[2 + 0].flatten() + assert np.allclose(vf0, [272.5642, 240.56417846679688]) + vf15 = v[2 + 15].flatten() + assert np.allclose(vf15, [226.6531524658203, 206.6531524658203]) + + index = slice(None, 2) + v = ds.data(flatten=True, index=index) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18 + 2, 2) + assert np.allclose(v[0].flatten(), lat[index]) + assert np.allclose(v[1].flatten(), lon[index]) + vf0 = v[2 + 0].flatten() + assert np.allclose(vf0, [272.56417847, 272.56417847]) + vf15 = v[2 + 15].flatten() + 
assert np.allclose(vf15, [226.65315247, 226.65315247]) + + index = slice(None, None) + v = ds.data(flatten=True, index=index) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (18 + 2, 84) + assert np.allclose(v[0].flatten(), lat) + assert np.allclose(v[1].flatten(), lon) + vf0 = v[2 + 0].flatten() + check_array( + vf0, + (84,), + first=272.5642, + last=240.56417846679688, + meanv=279.70703560965404, + eps=eps, + ) + + vf15 = v[2 + 15].flatten() + check_array( + vf15, + (84,), + first=226.6531524658203, + last=206.6531524658203, + meanv=227.84362865629652, + eps=eps, + ) + + index = (slice(None, 2), slice(None, 3)) + v = ds.data(index=index) + assert isinstance(v, np.ndarray) + assert v.dtype == np.float64 + assert v.shape == (2 + 18, 2, 3) + latlon = ds.to_latlon() + lat = latlon["lat"] + lon = latlon["lon"] + assert np.allclose(v[0], lat[index]) + assert np.allclose(v[1], lon[index]) + + vf0 = v[2 + 0].flatten() + assert np.allclose( + vf0, + [ + 272.56417847, + 272.56417847, + 272.56417847, + 288.56417847, + 296.56417847, + 288.56417847, + ], + ) + vf15 = v[2 + 15].flatten() + assert np.allclose( + vf15, + [ + 226.65315247, + 226.65315247, + 226.65315247, + 230.65315247, + 230.65315247, + 230.65315247, + ], + ) + + @pytest.mark.parametrize("fl_type", FL_TYPES) @pytest.mark.parametrize("array_backend", ARRAY_BACKENDS) def test_grib_values_with_missing(fl_type, array_backend):