diff --git a/experimental/dataset_search_agent/scienceai_ee_dataset_search_embeddings_generation_v0.ipynb b/experimental/dataset_search_agent/scienceai_ee_dataset_search_embeddings_generation_v0.ipynb new file mode 100644 index 000000000..b49b26b2a --- /dev/null +++ b/experimental/dataset_search_agent/scienceai_ee_dataset_search_embeddings_generation_v0.ipynb @@ -0,0 +1,2247 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j9jYuqKak7vM" + }, + "outputs": [], + "source": [ + "#@title Copyright 2024 The Earth Engine Community Authors { display-mode: \"form\" }\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gu5qMojEL8do" + }, + "source": [ + "# Earth Engine Dataset Search Embeddings\n", + "\n", + "## Overview\n", + "This notebook, built by the Science AI team in Google Research, is intended to supplement our main [Dataset Search Agent](https://github.com/google/earthengine-community/blob/master/experimental/scienceai_ee_dataset_search_agent/scienceai_ee_dataset_search_agent_v0.ipynb) by demonstrating how the dataset summaries and embeddings were generated for use in the main Dataset Search notebook. 
For more details on the project as a whole, see the main notebook or the [README](https://github.com/google/earthengine-community/blob/master/experimental/scienceai_ee_dataset_search_agent/README.md).\n", + "\n", + "The notebook uses:\n", + "\n", + " - The [Gemini 1.5 Pro language model](https://blog.google/technology/ai/gemini-1-5/) to create concise summaries of Earth Engine dataset descriptions.\n", + " - The [Google Text Embedding API](https://cloud.google.com/natural-language/docs/embedding-overview) to generate vector representations of these summaries. These embeddings are then uploaded to a Google Cloud Storage bucket for use in downstream applications, such as the [Dataset Search Agent](https://github.com/google/earthengine-community/blob/master/experimental/scienceai_ee_dataset_search_agent/scienceai_ee_dataset_search_agent_v0.ipynb).\n", + "\n", + "\n", + "## Setup Details and Billing\n", + "\n", + "You will need:\n", + "\n", + "- A Google Cloud project with the Earth Engine API enabled. ([Details](https://developers.google.com/earth-engine/cloud/earthengine_cloud_project_setup)).\n", + "- A Gemini API key. ([Details](https://ai.google.dev/gemini-api/docs/api-key)).\n", + "- (Optionally) A predefined Google Cloud Storage (GCS) 'bucket'. ([Details](https://cloud.google.com/storage/docs/buckets)).\n", + "\n", + "Each of the above can be stored in the [Colab \"Secrets\" panel](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75). Add the following strings as secrets:\n", + "\n", + " - Use `GOOGLE_PROJECT_ID` for the Cloud project ID.\n", + " - Use `GOOGLE_API_KEY` for the Gemini API key.\n", + " - Use `DESTINATION_BUCKET` for the GCS bucket where you want to upload embeddings.\n", + "\n", + "## Caveats\n", + "\n", + " - This is an early prototype; bugs and unexpected behavior are likely. 
Code improvements and refactors to follow.\n", + "\n", + " - Currently the notebook uses Langchain for some of the dataset summarization \"glue\", but this will likely change in a future version.\n", + "\n", + " - The very lightweight use of the TextEmbedding API from VertexAI requires billing to be enabled in your Cloud project. It should be an extremely minimal expense. ([Details](https://cloud.google.com/vertex-ai/generative-ai/pricing)).\n", + "\n", + " - For assistance, please email scienceai_ee_dataset_search_agent@googlegroups.com." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "XMTSCU9qKxgj" + }, + "outputs": [], + "source": [ + "#@title Install Python Libraries\n", + "\n", + "%%capture\n", + "!pip install google_cloud_aiplatform langchain-community langchain_google_genai langchain iso8601" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "iq7d8vqEKxgk" + }, + "outputs": [], + "source": [ + "# Standard library imports\n", + "import dataclasses\n", + "import datetime\n", + "import json\n", + "import logging\n", + "import os\n", + "import re\n", + "import time\n", + "from concurrent import futures\n", + "from functools import partial\n", + "from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence\n", + "\n", + "# Third-party imports\n", + "import iso8601\n", + "import pandas as pd\n", + "import tenacity\n", + "import tqdm\n", + "import vertexai\n", + "from IPython.display import HTML, display, clear_output\n", + "from google.api_core import exceptions as google_exceptions\n", + "from google.cloud import storage\n", + "from google.colab import userdata\n", + "from langchain.chains.summarize import load_summarize_chain\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain_core.language_models.base import BaseLanguageModel\n", + 
"from langchain_google_genai import ChatGoogleGenerativeAI\n", + "from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, wait_fixed\n", + "from vertexai.preview.language_models import TextEmbeddingModel\n", + "\n", + "# Specific exception imports\n", + "from google.api_core.exceptions import ResourceExhausted\n", + "\n", + "# Concurrent processing imports\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "58eOcl7YKLT2" + }, + "outputs": [], + "source": [ + "#@title Setup\n", + "project_name = userdata.get('GOOGLE_PROJECT_ID')\n", + "vertex_ai_zone = \"us-central1\"\n", + "\n", + "storage_client = storage.Client(project=project_name)\n", + "from google.colab import auth\n", + "auth.authenticate_user()\n", + "vertexai.init(project=project_name, location=vertex_ai_zone)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZU8LnyaAwG7s" + }, + "source": [ + "# Define classes for working with the Earth Engine data catalog\n", + "\n", + "These will soon be broken up into their own files." 
#@title Helper methods
def matches_interval(
    collection_interval: tuple[datetime.datetime, Optional[datetime.datetime]],
    query_interval: tuple[datetime.datetime, datetime.datetime],
) -> bool:
  """Checks if the collection's datetime interval overlaps the query interval.

  Args:
    collection_interval: (start, end) of the collection. `end` may be None, in
      which case the current UTC time is used.
    query_interval: a tuple with the query interval start and end.

  Returns:
    True if the query ends strictly after the collection starts and the query
    starts no later than the collection ends.
  """
  start_query, end_query = query_interval
  start_collection, end_collection = collection_interval
  if end_collection is None:
    # End date should always be set in STAC JSON files, but just in case...
    # `datetime.timezone.utc` is used rather than `datetime.UTC`, which only
    # exists on Python 3.11+.
    end_collection = datetime.datetime.now(tz=datetime.timezone.utc)
  return end_query > start_collection and start_query <= end_collection


def matches_datetime(
    collection_interval: tuple[datetime.datetime, Optional[datetime.datetime]],
    query_datetime: datetime.datetime,
) -> bool:
  """Checks if the query datetime falls inside the collection's interval.

  Args:
    collection_interval: (start, end) of the collection. `end` may be None, in
      which case the current UTC time is used.
    query_datetime: a datetime coming from a query.

  Returns:
    True if start <= query_datetime <= end (both bounds inclusive).
  """
  start_date, end_date = collection_interval[0], collection_interval[1]
  if end_date is None:
    # End date should always be set in STAC JSON files, but just in case...
    end_date = datetime.datetime.now(tz=datetime.timezone.utc)
  return start_date <= query_datetime <= end_date
"cellView": "form", + "id": "3SG0ahqKJBYW" + }, + "outputs": [], + "source": [ + "# @title class BBox()\n", + "@dataclasses.dataclass\n", + "class BBox:\n", + " \"\"\"Class representing a lat/lon bounding box.\"\"\"\n", + " west: float\n", + " south: float\n", + " east: float\n", + " north: float\n", + "\n", + " def is_global(self) -\u003e bool:\n", + " return (\n", + " self.west == -180 and self.south == -90 and\n", + " self.east == 180 and self.north == 90)\n", + "\n", + " @classmethod\n", + " def from_list(cls, bbox_list: list[float]):\n", + " \"\"\"Constructs a BBox from a list of four numbers [west,south,east,north].\"\"\"\n", + " if bbox_list[0] \u003e bbox_list[2]:\n", + " raise ValueError(\n", + " 'The smaller (west) coordinate must be listed first in a bounding box'\n", + " f' corner list. Found {bbox_list}'\n", + " )\n", + " if bbox_list[1] \u003e bbox_list[3]:\n", + " raise ValueError(\n", + " 'The smaller (south) coordinate must be listed first in a bounding'\n", + " f' box corner list. 
Found {bbox_list}'\n", + " )\n", + " return cls(bbox_list[0], bbox_list[1], bbox_list[2], bbox_list[3])\n", + "\n", + " def to_list(self) -\u003e list[float]:\n", + " return [self.west, self.south, self.east, self.north]\n", + "\n", + " def intersects(self, query_bbox) -\u003e bool:\n", + " \"\"\"Checks if this bbox intersects with the query bbox.\n", + "\n", + " Doesn't handle bboxes extending past the antimeridaian.\n", + "\n", + " Args:\n", + " query_bbox: Bounding box from the query.\n", + "\n", + " Returns:\n", + " True if the two bounding boxes intersect\n", + " \"\"\"\n", + " return (\n", + " query_bbox.west \u003c self.east\n", + " and query_bbox.east \u003e self.west\n", + " and query_bbox.south \u003c self.north\n", + " and query_bbox.north \u003e self.south\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "_CxF7Wz5wUZq" + }, + "outputs": [], + "source": [ + "# @title class Collection()\n", + "class Collection:\n", + " \"\"\"A simple wrapper for a STAC Collection..\"\"\"\n", + " stac_json: dict[str, Any]\n", + "\n", + " def __init__(self, stac_json: dict[str, Any]):\n", + " self.stac_json = stac_json\n", + " if stac_json.get('gee:status') == 'deprecated':\n", + " # Set the STAC 'deprecated' field that we don't set in the jsonnet files\n", + " stac_json['deprecated'] = True\n", + "\n", + " def __getitem__(self, item: str) -\u003e Any:\n", + " return self.stac_json[item]\n", + "\n", + " def get(self, item: str, default: Optional[Any] = None) -\u003e Optional[Any]:\n", + " \"\"\"Matches dict's get by returning None if there is no item.\"\"\"\n", + " return self.stac_json.get(item, default)\n", + "\n", + " def public_id(self) -\u003e str:\n", + " return self['id']\n", + "\n", + " def hyphen_id(self) -\u003e str:\n", + " return self['id'].replace('/', '_')\n", + "\n", + " def get_dataset_type(self) -\u003e str:\n", + " \"\"\"Could be Image, ImageCollection, FeatureCollection, 
Feature.\"\"\"\n", + " return self['gee:type']\n", + "\n", + " def is_deprecated(self) -\u003e bool:\n", + " \"\"\"Returns True for collections that are deprecated or have a successor.\"\"\"\n", + " if self.get('deprecated', False):\n", + " logging.info('Skipping deprecated collection: %s', self.public_id())\n", + " return True\n", + "\n", + " def datetime_interval(\n", + " self,\n", + " ) -\u003e Iterable[tuple[datetime.datetime, Optional[datetime.datetime]]]:\n", + " \"\"\"Returns datetime objects representing temporal extents.\"\"\"\n", + " for stac_interval in self.stac_json['extent']['temporal']['interval']:\n", + " if not stac_interval[0]:\n", + " raise ValueError(\n", + " 'Expected a non-empty temporal interval start for '\n", + " + self.public_id()\n", + " )\n", + " start_date = iso8601.parse_date(stac_interval[0])\n", + " if stac_interval[1] is not None:\n", + " end_date = iso8601.parse_date(stac_interval[1])\n", + " else:\n", + " end_date = None\n", + " yield (start_date, end_date)\n", + "\n", + " def start(self) -\u003e datetime.datetime:\n", + " return list(self.datetime_interval())[0][0]\n", + "\n", + " def start_str(self) -\u003e datetime.datetime:\n", + " if not self.start():\n", + " return ''\n", + " return self.start().strftime(\"%Y-%m-%d\")\n", + "\n", + " def end(self) -\u003e Optional[datetime.datetime]:\n", + " return list(self.datetime_interval())[0][1]\n", + "\n", + " def end_str(self) -\u003e Optional[datetime.datetime]:\n", + " if not self.end():\n", + " return ''\n", + " return self.end().strftime(\"%Y-%m-%d\")\n", + "\n", + " def bbox_list(self) -\u003e Sequence[BBox]:\n", + " if 'extent' not in self.stac_json:\n", + " # Assume global if nothing listed.\n", + " return (BBox(-180, -90, 180, 90),)\n", + " return tuple([\n", + " BBox.from_list(x)\n", + " for x in self.stac_json['extent']['spatial']['bbox']\n", + " ])\n", + "\n", + " def bands(self) -\u003e List[Dict]:\n", + " summaries = self.stac_json.get('summaries')\n", + " if not 
summaries:\n", + " return []\n", + " return summaries.get('eo:bands', [])\n", + "\n", + " def spatial_resolution_m(self) -\u003e float:\n", + " summaries = self.stac_json.get('summaries')\n", + " if not summaries:\n", + " return -1\n", + " if summaries.get('gsd'):\n", + " return summaries.get('gsd')[0]\n", + "\n", + " # Hacky fallback for cases where the stac does not follow convention.\n", + " gsd_lst = re.findall(r'\"gsd\": (\\d+)', json.dumps(self.stac_json))\n", + "\n", + " if len(gsd_lst) \u003e 0:\n", + " return float(gsd_lst[0])\n", + "\n", + " return -1\n", + "\n", + "\n", + " def temporal_resolution_str(self) -\u003e str:\n", + " interval_dict = self.stac_json.get('gee:interval')\n", + " if not interval_dict:\n", + " return \"\"\n", + " return f\"{interval_dict['interval']} {interval_dict['unit']}\"\n", + "\n", + "\n", + " def set_js_code(self, code: str):\n", + " if not code:\n", + " return ''\n", + " js_code = self.stac_json.get('code').get('js_code')\n", + " self.stac_json['code'] = {'js_code': ''}\n", + "\n", + " def image_preview_url(self):\n", + " for link in self.stac_json['links']:\n", + " if 'rel' in link and link['rel'] == 'preview' and link['type'] == 'image/png':\n", + " return link['href']\n", + " raise ValueError(f\"No preview image found for {id}\")\n", + "\n", + "\n", + " def catalog_url(self):\n", + " links = self.stac_json['links']\n", + " for link in links:\n", + " if 'rel' in link and link['rel'] == 'catalog':\n", + " return link['href']\n", + "\n", + " # Ideally there would be a 'catalog' link but sometimes there isn't.\n", + " base_url = \"https://developers.google.com/earth-engine/datasets/catalog/\"\n", + " if link['href'].startswith(base_url):\n", + " return link['href'].split('#')[0]\n", + "\n", + " logging.warning(f\"No catalog link found for {self.public_id()}\")\n", + " return \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "eJVB0cDYweD5" + }, + "outputs": [], + 
"source": [ + "# @title class CollectionList()\n", + "class CollectionList(Sequence[Collection]):\n", + " \"\"\"List of stac.Collections; can be filtered to return a smaller sublist.\"\"\"\n", + "\n", + " _collections = Sequence[Collection]\n", + "\n", + " def __init__(self, collections: Sequence[Collection]):\n", + " self._collections = tuple(collections)\n", + "\n", + " def __iter__(self):\n", + " return iter(self._collections)\n", + "\n", + " def __getitem__(self, index):\n", + " return self._collections[index]\n", + "\n", + " def __len__(self):\n", + " return len(self._collections)\n", + "\n", + " def __eq__(self, other: object) -\u003e bool:\n", + " if isinstance(other, CollectionList):\n", + " return self._collections == other._collections\n", + " return False\n", + "\n", + " def __hash__(self) -\u003e int:\n", + " return hash(self._collections)\n", + "\n", + " def filter_by_ids(self, ids: Iterable[str]):\n", + " \"\"\"Returns a sublist with only the collections matching the given ids.\"\"\"\n", + " return self.__class__(\n", + " [c for c in self._collections if c.public_id() in ids]\n", + " )\n", + "\n", + " def filter_by_datetime(\n", + " self,\n", + " query_datetime: datetime.datetime,\n", + " ):\n", + " \"\"\"Returns a sublist with the time interval matching the given time.\"\"\"\n", + " result = []\n", + " for collection in self._collections:\n", + " for datetime_interval in collection.datetime_interval():\n", + " if matches_datetime(datetime_interval, query_datetime):\n", + " result.append(collection)\n", + " break\n", + " return self.__class__(result)\n", + "\n", + " def filter_by_interval(\n", + " self,\n", + " query_interval: tuple[datetime.datetime, datetime.datetime],\n", + " ):\n", + " \"\"\"Returns a sublist with the time interval matching the given interval.\"\"\"\n", + " result = []\n", + " for collection in self._collections:\n", + " for datetime_interval in collection.datetime_interval():\n", + " if matches_interval(datetime_interval, 
query_interval):\n", + " result.append(collection)\n", + " break\n", + " return self.__class__(result)\n", + "\n", + " def filter_by_bounding_box_list(\n", + " self, query_bbox: BBox):\n", + " \"\"\"Returns a sublist with the bbox matching the given bbox.\"\"\"\n", + " result = []\n", + " for collection in self._collections:\n", + " for collection_bbox in collection.bbox_list():\n", + " if collection_bbox.intersects(query_bbox):\n", + " result.append(collection)\n", + " break\n", + " return self.__class__(result)\n", + "\n", + " def filter_by_bounding_box(\n", + " self, query_bbox: BBox):\n", + " \"\"\"Returns a sublist with the bbox matching the given bbox.\"\"\"\n", + " result = []\n", + " for collection in self._collections:\n", + " for collection_bbox in collection.bbox_list():\n", + " if collection_bbox.intersects(query_bbox):\n", + " result.append(collection)\n", + " break\n", + " return self.__class__(result)\n", + "\n", + "\n", + " def start_str(self) -\u003e datetime.datetime:\n", + " return self.start().strftime(\"%Y-%m-%d\")\n", + "\n", + "\n", + " def sort_by_spatial_resolution(self, reverse=False):\n", + " \"\"\"\n", + " Sorts the collections based on their spatial resolution.\n", + " Collections with spatial_resolution_m() == -1 are pushed to the end.\n", + "\n", + " Args:\n", + " reverse (bool): If True, sort in descending order (highest resolution first).\n", + " If False (default), sort in ascending order (lowest resolution first).\n", + "\n", + " Returns:\n", + " CollectionList: A new CollectionList instance with sorted collections.\n", + " \"\"\"\n", + " def sort_key(collection):\n", + " resolution = collection.spatial_resolution_m()\n", + " if resolution == -1:\n", + " return float('inf') if not reverse else float('-inf')\n", + " return resolution\n", + "\n", + " sorted_collections = sorted(\n", + " self._collections,\n", + " key=sort_key,\n", + " reverse=reverse\n", + " )\n", + " return self.__class__(sorted_collections)\n", + "\n", + "\n", + " 
def limit(self, n: int):\n", + " \"\"\"\n", + " Returns a new CollectionList containing the first n entries.\n", + "\n", + " Args:\n", + " n (int): The number of entries to include in the new list.\n", + "\n", + " Returns:\n", + " CollectionList: A new CollectionList instance with at most n collections.\n", + " \"\"\"\n", + " return self.__class__(self._collections[:n])\n", + "\n", + "\n", + " def to_df(self):\n", + " \"\"\"Converts a collection list to a dataframe with a select set of fields.\"\"\"\n", + "\n", + " rows = []\n", + " for col in self._collections:\n", + " # Remove text in parens in dataset name.\n", + " short_title = re.sub(r'\\([^)]*\\)', '', col.get('title')).strip()\n", + "\n", + " row = {\n", + " 'id': col.public_id(),\n", + " 'name': short_title,\n", + " 'temp_res': col.temporal_resolution_str(),\n", + " 'spatial_res_m': col.spatial_resolution_m(),\n", + " 'earliest': col.start_str(),\n", + " 'latest': col.end_str(),\n", + " 'url': col.catalog_url()\n", + " }\n", + " rows.append(row)\n", + " return pd.DataFrame(rows)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "Kp39kZxDwlR3" + }, + "outputs": [], + "source": [ + "#@title class Catalog()\n", + "class Catalog:\n", + " \"\"\"Class containing all collections in the EE STAC catalog.\"\"\"\n", + "\n", + " collections: CollectionList\n", + "\n", + " def __init__(self, storage_client: storage.Client):\n", + " self.collections = CollectionList(self._load_collections(storage_client))\n", + "\n", + " def get_collection(self, id: str) -\u003e Collection:\n", + " \"\"\"Returns the collection with the given id.\"\"\"\n", + " col = self.collections.filter_by_ids([id])\n", + " if len(col) == 0:\n", + " raise ValueError(f'No collection with id {id}')\n", + " return col[0]\n", + "\n", + "\n", + " @tenacity.retry(\n", + " stop=tenacity.stop_after_attempt(5),\n", + " wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),\n", + " 
retry=tenacity.retry_if_exception_type((\n", + " google_exceptions.GoogleAPICallError,\n", + " google_exceptions.RetryError,\n", + " ConnectionError\n", + " )),\n", + " before_sleep=lambda retry_state: print(\n", + " f\"Error occurred: {str(retry_state.outcome.exception())}\\n\"\n", + " f\"Retrying in {retry_state.next_action.sleep} seconds... \"\n", + " f\"(Attempt {retry_state.attempt_number}/3)\"\n", + " )\n", + " )\n", + " def _read_file(self, file_blob: storage.blob.Blob) -\u003e Collection:\n", + " \"\"\"Reads the contents of a file from the specified bucket.\"\"\"\n", + " file_contents = file_blob.download_as_string().decode()\n", + " return Collection(json.loads(file_contents))\n", + "\n", + " def _read_files(\n", + " self, file_blobs: list[storage.blob.Blob]\n", + " ) -\u003e list[Collection]:\n", + " \"\"\"Processes files in parallel.\"\"\"\n", + " collections = []\n", + " with futures.ThreadPoolExecutor(max_workers=10) as executor:\n", + " file_futures = [\n", + " executor.submit(self._read_file, file_blob)\n", + " for file_blob in file_blobs\n", + " ]\n", + " for future in file_futures:\n", + " collections.append(future.result())\n", + " return collections\n", + "\n", + " def _load_collections(\n", + " self, storage_client: storage.Client\n", + " ) -\u003e Sequence[Collection]:\n", + " \"\"\"Loads all EE STAC JSON files from GCS, with datetimes as objects.\"\"\"\n", + " bucket = storage_client.get_bucket('earthengine-stac')\n", + " files = [\n", + " x\n", + " for x in bucket.list_blobs(prefix='catalog/')\n", + " if x.name.endswith('.json')\n", + " and not x.name.endswith('/catalog.json')\n", + " and not x.name.endswith('/units.json')\n", + " ]\n", + " logging.warning('Found %d files, loading...', len(files))\n", + " collections = self._read_files(files)\n", + "\n", + " code_samples_dict = self._load_all_code_samples(storage_client)\n", + "\n", + " res = []\n", + " for c in collections:\n", + " if c.is_deprecated():\n", + " continue\n", + " 
c.stac_json['code'] = code_samples_dict.get(c.hyphen_id())\n", + " res.append(c)\n", + " logging.warning(\n", + " 'Loaded %d collections (skipping deprecated ones)', len(res)\n", + " )\n", + " # Returning a tuple for immutability.\n", + " return tuple(res)\n", + "\n", + " def _load_all_code_samples(self, storage_client: storage.Client):\n", + " \"\"\"Loads js + py example scripts from GCS into dict keyed by dataset ID.\"\"\"\n", + "\n", + " # Get json file from GCS bucket\n", + " # 'gs://earthengine-catalog/catalog/example_scripts.json'\n", + " bucket = storage_client.get_bucket('earthengine-catalog')\n", + " blob= bucket.blob('catalog/example_scripts.json')\n", + " file_contents = blob.download_as_string().decode()\n", + " data = json.loads(file_contents)\n", + "\n", + " # Flatten json to get a map from ID (using '_' rather than '/') to code\n", + " # sample.\n", + " all_datasets_by_provider = data[0]['contents']\n", + " code_samples_dict = {}\n", + " for provider in all_datasets_by_provider:\n", + " for dataset in provider['contents']:\n", + " js_code = dataset['code']\n", + "\n", + " code_samples_dict[dataset['name']] = {\n", + " 'js_code': js_code}\n", + "\n", + " return code_samples_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gcCwKks05JzB" + }, + "source": [ + "## Test catalog/collection functions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 7349, + "status": "ok", + "timestamp": 1726774360043, + "user": { + "displayName": "Renee Johnston", + "userId": "00065470300030840468" + }, + "user_tz": 420 + }, + "id": "77aNEPVKT5hH", + "outputId": "bcafc6ac-857d-447d-d698-82a33e4c9c56" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Found 1010 files, loading...\n", + "WARNING:root:Loaded 812 collections (skipping deprecated ones)\n" + ] + } + ], + "source": [ + 
"catalog = Catalog(storage_client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "executionInfo": { + "elapsed": 209, + "status": "ok", + "timestamp": 1726774465278, + "user": { + "displayName": "Renee Johnston", + "userId": "00065470300030840468" + }, + "user_tz": 420 + }, + "id": "7D0D-JsNXH-V", + "outputId": "07db2516-9a5d-460e-cd42-f7d0ce8de5e4" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\u003ctable border=\"1\" class=\"dataframe\"\u003e\n", + " \u003cthead\u003e\n", + " \u003ctr style=\"text-align: right;\"\u003e\n", + " \u003cth\u003e\u003c/th\u003e\n", + " \u003cth\u003eid\u003c/th\u003e\n", + " \u003cth\u003ename\u003c/th\u003e\n", + " \u003cth\u003etemp_res\u003c/th\u003e\n", + " \u003cth\u003espatial_res_m\u003c/th\u003e\n", + " \u003cth\u003eearliest\u003c/th\u003e\n", + " \u003cth\u003elatest\u003c/th\u003e\n", + " \u003cth\u003eurl\u003c/th\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/thead\u003e\n", + " \u003ctbody\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e0\u003c/th\u003e\n", + " \u003ctd\u003eCGIAR/SRTM90_V4\u003c/td\u003e\n", + " \u003ctd\u003eSRTM Digital Elevation Data Version 4\u003c/td\u003e\n", + " \u003ctd\u003e\u003c/td\u003e\n", + " \u003ctd\u003e90.00\u003c/td\u003e\n", + " \u003ctd\u003e2000-02-11\u003c/td\u003e\n", + " \u003ctd\u003e2000-02-22\u003c/td\u003e\n", + " \u003ctd\u003e\u003ca href=\"https://developers.google.com/earth-engine/datasets/catalog/CGIAR_SRTM90_V4\" target=\"_blank\"\u003ehttps://developers.google.com/earth-engine/datasets/catalog/CGIAR_SRTM90_V4\u003c/a\u003e\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e1\u003c/th\u003e\n", + " \u003ctd\u003eCIESIN/GPWv411/GPW_Land_Area\u003c/td\u003e\n", + " \u003ctd\u003eGPWv411: Land Area\u003c/td\u003e\n", + " \u003ctd\u003e\u003c/td\u003e\n", + " \u003ctd\u003e927.67\u003c/td\u003e\n", + " 
# @title Source code for dataset summarization and embedding modules

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))
def summarize_text(text: str, llm: BaseLanguageModel) -> str:
  """Summarize a given text using a language model.

  This function splits the input text into chunks, then uses a map-reduce
  summarization chain to generate a summary.

  Args:
    text (str): The text to be summarized.
    llm (BaseLanguageModel): The language model to use for summarization.

  Returns:
    str: The summarized text ('' for empty input, without calling the model).

  Raises:
    tenacity.RetryError: If summarization fails on all 5 attempts.
  """
  if not text:
    return ''

  # Remove newlines (and any indentation that follows them) in the description.
  text = re.sub(r'\n\s*', ' ', text)

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=200,
      length_function=len,
  )

  docs = text_splitter.create_documents([text])
  chain = load_summarize_chain(llm, chain_type="map_reduce")
  # `Chain.run` is deprecated; `invoke` returns a dict holding the summary
  # under the chain's output key.
  return chain.invoke({"input_documents": docs})["output_text"]


def summarize_collection(collection: 'Collection', llm: BaseLanguageModel) -> Dict[str, str]:
  """Summarize the dataset description and band information for a data collection.

  Args:
    collection (Collection): The collection object containing dataset information.
    llm (BaseLanguageModel): The language model to use for summarization.

  Returns:
    Dict[str, str]: A dictionary containing the collection's ID, name, and
      summarized description followed by the band (and band class) descriptions.
  """
  # A missing description is summarized as '' instead of failing every retry.
  summarized_description = summarize_text(collection.get('description') or '', llm)

  # Adding text about individual bands improves search performance.
  band_lines = []
  for band in collection.bands():
    band_lines.append(f'"{band["name"]}" represents {band.get("description", "")}\n')
    if 'gee:classes' in band:
      band_lines.append(" Classes:\n")
      for band_class in band['gee:classes']:
        band_lines.append(f' {band_class.get("description", "")}\n')

  return {
      'id': collection.public_id(),
      'name': collection.get('title'),
      'summary': summarized_description + "\n\n" + ''.join(band_lines)
  }


def summarize_ee_catalog(
    catalog: 'Catalog',
    llm: BaseLanguageModel,
    output_path: Optional[str] = None,
    max_workers: int = 8,
) -> List[Dict[str, str]]:
  """Generate summaries of all dataset descriptions in an Earth Engine data catalog.

  This function processes all collections in the catalog concurrently,
  summarizing each collection's description and band information.

  Args:
    catalog (Catalog): The Earth Engine data catalog to summarize.
    llm (BaseLanguageModel): The language model to use for summarization.
    output_path (Optional[str]): If provided, the summaries are also written to
      this path as JSON Lines (one record per line).
    max_workers (int): Size of the thread pool. Kept small by default to limit
      throttling by the language model API.

  Returns:
    List[Dict[str, str]]: One {'id', 'name', 'summary'} record per collection,
      in catalog order. (Callers iterate over these records, so a list is
      returned rather than a DataFrame; use pd.DataFrame(result) if needed.)
  """
  summarize_collection_partial = partial(summarize_collection, llm=llm)

  with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm.tqdm(
        executor.map(summarize_collection_partial, catalog.collections),
        total=len(catalog.collections)))

  # The original returned before reaching this step, so `output_path` was
  # silently ignored.
  if output_path:
    with open(output_path, 'w') as f:
      for entry in results:
        f.write(json.dumps(entry))
        f.write('\n')
  return results


def get_embeddings_wrapper(
    texts: List[str], model: TextEmbeddingModel, batch_size: int = 5
) -> List[List[float]]:
  """Embeds `texts` in batches and returns one vector per text, in order.

  Args:
    texts: The strings to embed.
    model: The VertexAI text embedding model.
    batch_size: Number of texts per request. The default of 5 is the batch size
      the original author noted VertexAI allows.

  Returns:
    A list with the `.values` of each embedding returned by the model.

  Raises:
    ValueError: if batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f'batch_size must be >= 1, got {batch_size}')

  embs = []
  for i in tqdm.tqdm(range(0, len(texts), batch_size)):
    time.sleep(1)  # to avoid the quota error
    result = model.get_embeddings(texts[i : i + batch_size])
    # extend() avoids re-copying the accumulated list on every batch.
    embs.extend(e.values for e in result)
  return embs


def add_embeddings_to_df(
    df: pd.DataFrame, col_to_embed: str, model: TextEmbeddingModel) -> pd.DataFrame:
  """Returns a copy of `df` with an 'embedding' column for `col_to_embed`."""
  embeddings = get_embeddings_wrapper(list(df[col_to_embed]), model=model)
  return df.assign(embedding=embeddings)
# @title Initialize Language and Text embedding models plus output destinations
import google
from google.cloud import storage

# We use Gemini 1.5 pro to summarize the original dataset descriptions
gemini_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=userdata.get('GOOGLE_API_KEY'))

# We use a VertexAI model for embedding the dataset summaries to eventually be
# loaded into a Vectorstore.
embedding_model = TextEmbeddingModel.from_pretrained("google/text-embedding-004")

# We write the output to disk to reduce the risk of needing to rerun.
CATALOG_SUMMARIES_PATH = 'ee_catalog_summaries.jsonl'
EMBEDDINGS_LOCAL_PATH = 'ee_catalog_embeddings.jsonl'

# Eventually we upload embeddings and summaries to GCS.
GCP_PROJECT = userdata.get('GOOGLE_PROJECT_ID')
DESTINATION_BUCKET = userdata.get('DESTINATION_BUCKET')
EMBEDDINGS_GCS_PATH = 'ee_catalog_embeddings.jsonl'

# The catalog-loading cell needs a storage client. Create it here so the
# notebook runs top to bottom on a fresh kernel instead of relying on the
# client that the final upload cell creates.
# NOTE(review): if an earlier setup cell already defines `storage_client`,
# this assignment is redundant - confirm and keep only one.
storage_client = storage.Client(project=GCP_PROJECT)

# @title Load the entire EE Public data catalog from GCS:
catalog = Catalog(storage_client)
#@title Use an LLM to generate per-collection dataset summaries.
# This tends to take around 10-15 minutes.

summary_json_list = summarize_ee_catalog(catalog, gemini_llm)

# Persist the summaries as JSON Lines (one record per line) so this
# time-consuming step rarely has to be repeated.
with open(CATALOG_SUMMARIES_PATH, 'w') as summaries_file:
  summaries_file.writelines(
      json.dumps(entry) + '\n' for entry in summary_json_list)
722,\n \"samples\": [\n \"MOD13A2.061 Terra Vegetation Indices 16-Day Global 1km\",\n \"Greenland DEM - Greenland Mapping Project (GIMP)\",\n \"Planet SkySat Public Ortho Imagery, RGB\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"summary\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 812,\n \"samples\": [\n \"**Landsat 8 Collection 2 Tier 1 data offers calibrated, top-of-atmosphere reflectance ideal for time-series analysis. Its consistent accuracy across Landsat sensors ensures reliable and comparable data over time.** \\n\\n\\n\\\"B1\\\" represents Coastal aerosol\\n\\\"B2\\\" represents Blue\\n\\\"B3\\\" represents Green\\n\\\"B4\\\" represents Red\\n\\\"B5\\\" represents Near infrared\\n\\\"B6\\\" represents Shortwave infrared 1\\n\\\"B7\\\" represents Shortwave infrared 2\\n\\\"B8\\\" represents Band 8 Panchromatic\\n\\\"B9\\\" represents Cirrus\\n\\\"B10\\\" represents Thermal infrared 1, resampled from 100m to 30m\\n\\\"B11\\\" represents Thermal infrared 2, resampled from 100m to 30m\\n\\\"QA_PIXEL\\\" represents Landsat Collection 2 OLI/TIRS QA Bitmask\\n\\n\\\"QA_RADSAT\\\" represents Radiometric saturation QA\\n\\\"SAA\\\" represents Solar Azimuth Angle\\n\\\"SZA\\\" represents Solar Zenith Angle\\n\\\"VAA\\\" represents View Azimuth Angle\\n\\\"VZA\\\" represents View Zenith Angle\\n\",\n \"The VNP64A1 data product uses daily images from the Suomi NPP satellite to identify burned areas globally. It provides monthly, 500m resolution maps that show fire locations and the reliability of those assessments. \\n\\n\\n\\\"Burn_Date\\\" represents Ordinal day of burn (1-366) for each 500-m grid cell.\\n\\n\\\"Burn_Date_Uncertainty\\\" represents Estimated uncertainty in date of burn, in days. 
Unburned, unmapped,\\nand water grid cells are masked out.\\n\\n\\\"QA\\\" represents Quality Assurance Indicators\\n\\\"First_Day\\\" represents First day of the year of reliable change detection\\n\\\"Last_Day\\\" represents Last day of the year of reliable change detection\\n\",\n \"Landsat Collection 2 Tier 1 Level 2 composites provide 32-day NDVI snapshots, using the latest available data, covering most of the year and excluding high latitudes, late Landsat 7, and early Landsat 8 imagery due to data quality issues. \\n\\n\\n\\\"NDVI\\\" represents Normalized Difference Vegetation Index\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "catalog_summary_df" + }, + "text/html": [ + "\n", + " \u003cdiv id=\"df-8eae13b5-051b-41bc-b2ef-c54253d4a192\" class=\"colab-df-container\"\u003e\n", + " \u003cdiv\u003e\n", + "\u003cstyle scoped\u003e\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "\u003c/style\u003e\n", + "\u003ctable border=\"1\" class=\"dataframe\"\u003e\n", + " \u003cthead\u003e\n", + " \u003ctr style=\"text-align: right;\"\u003e\n", + " \u003cth\u003e\u003c/th\u003e\n", + " \u003cth\u003eid\u003c/th\u003e\n", + " \u003cth\u003ename\u003c/th\u003e\n", + " \u003cth\u003esummary\u003c/th\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/thead\u003e\n", + " \u003ctbody\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e0\u003c/th\u003e\n", + " \u003ctd\u003eAAFC/ACI\u003c/td\u003e\n", + " \u003ctd\u003eCanada AAFC Annual Crop Inventory\u003c/td\u003e\n", + " \u003ctd\u003eAgriculture and Agri-Food Canada annually maps...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e1\u003c/th\u003e\n", + " \u003ctd\u003eACA/reef_habitat/v2_0\u003c/td\u003e\n", + " 
\u003ctd\u003eAllen Coral Atlas (ACA) - Geomorphic Zonation ...\u003c/td\u003e\n", + " \u003ctd\u003eThe Allen Coral Atlas, a global, high-resoluti...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e2\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_INT\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM is a high-resolution (0.5m) digita...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e3\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_NON\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Non-Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM is a detailed (0.5m resolution) el...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e4\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_RUW\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Raw Samples\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM, created from 2007-2012 LIDAR data...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/tbody\u003e\n", + "\u003c/table\u003e\n", + "\u003c/div\u003e\n", + " \u003cdiv class=\"colab-df-buttons\"\u003e\n", + "\n", + " \u003cdiv class=\"colab-df-container\"\u003e\n", + " \u003cbutton class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8eae13b5-051b-41bc-b2ef-c54253d4a192')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + " \u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\"\u003e\n", + " \u003cpath d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/\u003e\n", + " \u003c/svg\u003e\n", + " \u003c/button\u003e\n", + "\n", + " \u003cstyle\u003e\n", + " .colab-df-container {\n", + " 
display:flex;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " .colab-df-buttons div {\n", + " margin-bottom: 4px;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " \u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " const buttonEl =\n", + " document.querySelector('#df-8eae13b5-051b-41bc-b2ef-c54253d4a192 button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-8eae13b5-051b-41bc-b2ef-c54253d4a192');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? 
Visit the ' +\n", + " '\u003ca target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb\u003edata table notebook\u003c/a\u003e'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " \u003c/script\u003e\n", + " \u003c/div\u003e\n", + "\n", + "\n", + "\u003cdiv id=\"df-e3ea3bb1-c269-4bcf-8bba-d3f72a6500d3\"\u003e\n", + " \u003cbutton class=\"colab-df-quickchart\" onclick=\"quickchart('df-e3ea3bb1-c269-4bcf-8bba-d3f72a6500d3')\"\n", + " title=\"Suggest charts\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + "\u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\"\u003e\n", + " \u003cg\u003e\n", + " \u003cpath d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/\u003e\n", + " \u003c/g\u003e\n", + "\u003c/svg\u003e\n", + " \u003c/button\u003e\n", + "\n", + "\u003cstyle\u003e\n", + " .colab-df-quickchart {\n", + " --bg-color: #E8F0FE;\n", + " --fill-color: #1967D2;\n", + " --hover-bg-color: #E2EBFA;\n", + " --hover-fill-color: #174EA6;\n", + " --disabled-fill-color: #AAA;\n", + " --disabled-bg-color: #DDD;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-quickchart {\n", + " --bg-color: #3B4455;\n", + " --fill-color: #D2E3FC;\n", + " --hover-bg-color: #434B5C;\n", + " --hover-fill-color: #FFFFFF;\n", + " --disabled-bg-color: #3B4455;\n", + " --disabled-fill-color: #666;\n", + " }\n", + "\n", + " .colab-df-quickchart {\n", + " background-color: var(--bg-color);\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: var(--fill-color);\n", + " height: 32px;\n", + " 
padding: 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-quickchart:hover {\n", + " background-color: var(--hover-bg-color);\n", + " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: var(--button-hover-fill-color);\n", + " }\n", + "\n", + " .colab-df-quickchart-complete:disabled,\n", + " .colab-df-quickchart-complete:disabled:hover {\n", + " background-color: var(--disabled-bg-color);\n", + " fill: var(--disabled-fill-color);\n", + " box-shadow: none;\n", + " }\n", + "\n", + " .colab-df-spinner {\n", + " border: 2px solid var(--fill-color);\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " animation:\n", + " spin 1s steps(1) infinite;\n", + " }\n", + "\n", + " @keyframes spin {\n", + " 0% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " border-left-color: var(--fill-color);\n", + " }\n", + " 20% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 30% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 40% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 60% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 80% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " 90% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " }\n", + "\u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " async function quickchart(key) {\n", + " const quickchartButtonEl =\n", + " document.querySelector('#' + key + ' 
button');\n", + " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", + " quickchartButtonEl.classList.add('colab-df-spinner');\n", + " try {\n", + " const charts = await google.colab.kernel.invokeFunction(\n", + " 'suggestCharts', [key], {});\n", + " } catch (error) {\n", + " console.error('Error during call to suggestCharts:', error);\n", + " }\n", + " quickchartButtonEl.classList.remove('colab-df-spinner');\n", + " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", + " }\n", + " (() =\u003e {\n", + " let quickchartButtonEl =\n", + " document.querySelector('#df-e3ea3bb1-c269-4bcf-8bba-d3f72a6500d3 button');\n", + " quickchartButtonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + " })();\n", + " \u003c/script\u003e\n", + "\u003c/div\u003e\n", + "\n", + " \u003c/div\u003e\n", + " \u003c/div\u003e\n" + ], + "text/plain": [ + " id name \\\n", + "0 AAFC/ACI Canada AAFC Annual Crop Inventory \n", + "1 ACA/reef_habitat/v2_0 Allen Coral Atlas (ACA) - Geomorphic Zonation ... \n", + "2 AHN/AHN2_05M_INT AHN Netherlands 0.5m DEM, Interpolated \n", + "3 AHN/AHN2_05M_NON AHN Netherlands 0.5m DEM, Non-Interpolated \n", + "4 AHN/AHN2_05M_RUW AHN Netherlands 0.5m DEM, Raw Samples \n", + "\n", + " summary \n", + "0 Agriculture and Agri-Food Canada annually maps... \n", + "1 The Allen Coral Atlas, a global, high-resoluti... \n", + "2 The AHN DEM is a high-resolution (0.5m) digita... \n", + "3 The AHN DEM is a detailed (0.5m resolution) el... \n", + "4 The AHN DEM, created from 2007-2012 LIDAR data... 
" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# @title View summary results\n", + "catalog_summary_df = pd.read_json(CATALOG_SUMMARIES_PATH, lines=True)\n", + "catalog_summary_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 223 + }, + "executionInfo": { + "elapsed": 202090, + "status": "ok", + "timestamp": 1726776121492, + "user": { + "displayName": "Renee Johnston", + "userId": "00065470300030840468" + }, + "user_tz": 420 + }, + "id": "MtziPdrogxbg", + "outputId": "cfb8e53d-e790-453c-9cc1-9cca839feaf4" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 163/163 [03:21\u003c00:00, 1.23s/it]\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"embedding_df\",\n \"rows\": 812,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 723,\n \"samples\": [\n \"MODIS/061/MOD13A3\",\n \"OSU/GIMP/DEM\",\n \"USDOS/LSIB_SIMPLE/2017\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 722,\n \"samples\": [\n \"MOD13A2.061 Terra Vegetation Indices 16-Day Global 1km\",\n \"Greenland DEM - Greenland Mapping Project (GIMP)\",\n \"Planet SkySat Public Ortho Imagery, RGB\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"summary\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 812,\n \"samples\": [\n \"**Landsat 8 Collection 2 Tier 1 data offers calibrated, top-of-atmosphere reflectance ideal for time-series analysis. 
Its consistent accuracy across Landsat sensors ensures reliable and comparable data over time.** \\n\\n\\n\\\"B1\\\" represents Coastal aerosol\\n\\\"B2\\\" represents Blue\\n\\\"B3\\\" represents Green\\n\\\"B4\\\" represents Red\\n\\\"B5\\\" represents Near infrared\\n\\\"B6\\\" represents Shortwave infrared 1\\n\\\"B7\\\" represents Shortwave infrared 2\\n\\\"B8\\\" represents Band 8 Panchromatic\\n\\\"B9\\\" represents Cirrus\\n\\\"B10\\\" represents Thermal infrared 1, resampled from 100m to 30m\\n\\\"B11\\\" represents Thermal infrared 2, resampled from 100m to 30m\\n\\\"QA_PIXEL\\\" represents Landsat Collection 2 OLI/TIRS QA Bitmask\\n\\n\\\"QA_RADSAT\\\" represents Radiometric saturation QA\\n\\\"SAA\\\" represents Solar Azimuth Angle\\n\\\"SZA\\\" represents Solar Zenith Angle\\n\\\"VAA\\\" represents View Azimuth Angle\\n\\\"VZA\\\" represents View Zenith Angle\\n\",\n \"The VNP64A1 data product uses daily images from the Suomi NPP satellite to identify burned areas globally. It provides monthly, 500m resolution maps that show fire locations and the reliability of those assessments. \\n\\n\\n\\\"Burn_Date\\\" represents Ordinal day of burn (1-366) for each 500-m grid cell.\\n\\n\\\"Burn_Date_Uncertainty\\\" represents Estimated uncertainty in date of burn, in days. Unburned, unmapped,\\nand water grid cells are masked out.\\n\\n\\\"QA\\\" represents Quality Assurance Indicators\\n\\\"First_Day\\\" represents First day of the year of reliable change detection\\n\\\"Last_Day\\\" represents Last day of the year of reliable change detection\\n\",\n \"Landsat Collection 2 Tier 1 Level 2 composites provide 32-day NDVI snapshots, using the latest available data, covering most of the year and excluding high latitudes, late Landsat 7, and early Landsat 8 imagery due to data quality issues. 
\\n\\n\\n\\\"NDVI\\\" represents Normalized Difference Vegetation Index\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embedding\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "embedding_df" + }, + "text/html": [ + "\n", + " \u003cdiv id=\"df-afc7c4c9-58a9-4be0-8445-5b3b47270db9\" class=\"colab-df-container\"\u003e\n", + " \u003cdiv\u003e\n", + "\u003cstyle scoped\u003e\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "\u003c/style\u003e\n", + "\u003ctable border=\"1\" class=\"dataframe\"\u003e\n", + " \u003cthead\u003e\n", + " \u003ctr style=\"text-align: right;\"\u003e\n", + " \u003cth\u003e\u003c/th\u003e\n", + " \u003cth\u003eid\u003c/th\u003e\n", + " \u003cth\u003ename\u003c/th\u003e\n", + " \u003cth\u003esummary\u003c/th\u003e\n", + " \u003cth\u003eembedding\u003c/th\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/thead\u003e\n", + " \u003ctbody\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e0\u003c/th\u003e\n", + " \u003ctd\u003eAAFC/ACI\u003c/td\u003e\n", + " \u003ctd\u003eCanada AAFC Annual Crop Inventory\u003c/td\u003e\n", + " \u003ctd\u003eAgriculture and Agri-Food Canada annually maps...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.03112766332924366, 0.022871049121022224, -...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e1\u003c/th\u003e\n", + " \u003ctd\u003eACA/reef_habitat/v2_0\u003c/td\u003e\n", + " \u003ctd\u003eAllen Coral Atlas (ACA) - Geomorphic Zonation ...\u003c/td\u003e\n", + " \u003ctd\u003eThe Allen Coral Atlas, a global, high-resoluti...\u003c/td\u003e\n", + " \u003ctd\u003e[0.006329342722892761, 0.056551311165094376, -...\u003c/td\u003e\n", + " 
\u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e2\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_INT\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM is a high-resolution (0.5m) digita...\u003c/td\u003e\n", + " \u003ctd\u003e[0.0030822136905044317, -0.06489657610654831, ...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e3\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_NON\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Non-Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM is a detailed (0.5m resolution) el...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.014630626887083054, -0.07648028433322906, ...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e4\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_RUW\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Raw Samples\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM, created from 2007-2012 LIDAR data...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.008305594325065613, -0.07478459924459457, ...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/tbody\u003e\n", + "\u003c/table\u003e\n", + "\u003c/div\u003e\n", + " \u003cdiv class=\"colab-df-buttons\"\u003e\n", + "\n", + " \u003cdiv class=\"colab-df-container\"\u003e\n", + " \u003cbutton class=\"colab-df-convert\" onclick=\"convertToInteractive('df-afc7c4c9-58a9-4be0-8445-5b3b47270db9')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + " \u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\"\u003e\n", + " \u003cpath d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/\u003e\n", + " \u003c/svg\u003e\n", + " 
\u003c/button\u003e\n", + "\n", + " \u003cstyle\u003e\n", + " .colab-df-container {\n", + " display:flex;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " .colab-df-buttons div {\n", + " margin-bottom: 4px;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " \u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " const buttonEl =\n", + " document.querySelector('#df-afc7c4c9-58a9-4be0-8445-5b3b47270db9 button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-afc7c4c9-58a9-4be0-8445-5b3b47270db9');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? 
Visit the ' +\n", + " '\u003ca target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb\u003edata table notebook\u003c/a\u003e'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " \u003c/script\u003e\n", + " \u003c/div\u003e\n", + "\n", + "\n", + "\u003cdiv id=\"df-f4321965-9011-4f94-8680-241cee9b79a6\"\u003e\n", + " \u003cbutton class=\"colab-df-quickchart\" onclick=\"quickchart('df-f4321965-9011-4f94-8680-241cee9b79a6')\"\n", + " title=\"Suggest charts\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + "\u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\"\u003e\n", + " \u003cg\u003e\n", + " \u003cpath d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/\u003e\n", + " \u003c/g\u003e\n", + "\u003c/svg\u003e\n", + " \u003c/button\u003e\n", + "\n", + "\u003cstyle\u003e\n", + " .colab-df-quickchart {\n", + " --bg-color: #E8F0FE;\n", + " --fill-color: #1967D2;\n", + " --hover-bg-color: #E2EBFA;\n", + " --hover-fill-color: #174EA6;\n", + " --disabled-fill-color: #AAA;\n", + " --disabled-bg-color: #DDD;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-quickchart {\n", + " --bg-color: #3B4455;\n", + " --fill-color: #D2E3FC;\n", + " --hover-bg-color: #434B5C;\n", + " --hover-fill-color: #FFFFFF;\n", + " --disabled-bg-color: #3B4455;\n", + " --disabled-fill-color: #666;\n", + " }\n", + "\n", + " .colab-df-quickchart {\n", + " background-color: var(--bg-color);\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: var(--fill-color);\n", + " height: 32px;\n", + " 
padding: 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-quickchart:hover {\n", + " background-color: var(--hover-bg-color);\n", + " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: var(--button-hover-fill-color);\n", + " }\n", + "\n", + " .colab-df-quickchart-complete:disabled,\n", + " .colab-df-quickchart-complete:disabled:hover {\n", + " background-color: var(--disabled-bg-color);\n", + " fill: var(--disabled-fill-color);\n", + " box-shadow: none;\n", + " }\n", + "\n", + " .colab-df-spinner {\n", + " border: 2px solid var(--fill-color);\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " animation:\n", + " spin 1s steps(1) infinite;\n", + " }\n", + "\n", + " @keyframes spin {\n", + " 0% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " border-left-color: var(--fill-color);\n", + " }\n", + " 20% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 30% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 40% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 60% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 80% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " 90% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " }\n", + "\u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " async function quickchart(key) {\n", + " const quickchartButtonEl =\n", + " document.querySelector('#' + key + ' 
button');\n", + " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", + " quickchartButtonEl.classList.add('colab-df-spinner');\n", + " try {\n", + " const charts = await google.colab.kernel.invokeFunction(\n", + " 'suggestCharts', [key], {});\n", + " } catch (error) {\n", + " console.error('Error during call to suggestCharts:', error);\n", + " }\n", + " quickchartButtonEl.classList.remove('colab-df-spinner');\n", + " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", + " }\n", + " (() =\u003e {\n", + " let quickchartButtonEl =\n", + " document.querySelector('#df-f4321965-9011-4f94-8680-241cee9b79a6 button');\n", + " quickchartButtonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + " })();\n", + " \u003c/script\u003e\n", + "\u003c/div\u003e\n", + "\n", + " \u003c/div\u003e\n", + " \u003c/div\u003e\n" + ], + "text/plain": [ + " id name \\\n", + "0 AAFC/ACI Canada AAFC Annual Crop Inventory \n", + "1 ACA/reef_habitat/v2_0 Allen Coral Atlas (ACA) - Geomorphic Zonation ... \n", + "2 AHN/AHN2_05M_INT AHN Netherlands 0.5m DEM, Interpolated \n", + "3 AHN/AHN2_05M_NON AHN Netherlands 0.5m DEM, Non-Interpolated \n", + "4 AHN/AHN2_05M_RUW AHN Netherlands 0.5m DEM, Raw Samples \n", + "\n", + " summary \\\n", + "0 Agriculture and Agri-Food Canada annually maps... \n", + "1 The Allen Coral Atlas, a global, high-resoluti... \n", + "2 The AHN DEM is a high-resolution (0.5m) digita... \n", + "3 The AHN DEM is a detailed (0.5m resolution) el... \n", + "4 The AHN DEM, created from 2007-2012 LIDAR data... \n", + "\n", + " embedding \n", + "0 [-0.03112766332924366, 0.022871049121022224, -... \n", + "1 [0.006329342722892761, 0.056551311165094376, -... \n", + "2 [0.0030822136905044317, -0.06489657610654831, ... \n", + "3 [-0.014630626887083054, -0.07648028433322906, ... \n", + "4 [-0.008305594325065613, -0.07478459924459457, ... 
" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# @title Calculate embeddings for each dataset summary\n", + "# This takes around 3-5 minutes due to the text embedding model's rate limits.\n", + "\n", + "embedding_df = add_embeddings_to_df(catalog_summary_df, 'summary', embedding_model)\n", + "\n", + "# First store locally, just in case something happens to the Colab runtime.\n", + "with open(EMBEDDINGS_LOCAL_PATH, 'w') as f:\n", + " f.write(embedding_df.to_json(orient='records', lines=True))\n", + "\n", + "# Make sure we can read the embeddings that were written to file.\n", + "embeddings_df = pd.read_json(EMBEDDINGS_LOCAL_PATH, lines=True)\n", + "embedding_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2AjnZCwfvG1q" + }, + "outputs": [], + "source": [ + "#@title Upload embeddings and summaries to GCS\n", + "storage_client = storage.Client(project=GCP_PROJECT)\n", + "\n", + "bucket = google.cloud.storage.bucket.Bucket(\n", + " storage_client, name=DESTINATION_BUCKET, user_project=GCP_PROJECT)\n", + "blob = bucket.blob(EMBEDDINGS_GCS_PATH)\n", + "blob.upload_from_filename(EMBEDDINGS_LOCAL_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "executionInfo": { + "elapsed": 964, + "status": "ok", + "timestamp": 1726776230223, + "user": { + "displayName": "Renee Johnston", + "userId": "00065470300030840468" + }, + "user_tz": 420 + }, + "id": "ZcbGmVnGj-Sr", + "outputId": "8d746bff-a6a5-4eae-943e-a80af18e6d68" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"embeddings_df\",\n \"rows\": 824,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 735,\n \"samples\": [\n 
\"Slovakia/orthos/25cm\",\n \"COPERNICUS/S3/OLCI\",\n \"LANDSAT/LM04/C02/T1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 727,\n \"samples\": [\n \"PRISM Monthly Spatial Climate Dataset AN81m\",\n \"WWF HydroSHEDS Basins Level 6\",\n \"MYD21C3.061 Aqua Land Surface Temperature and 3-Band Emissivity Monthly L3 Global 0.05 Deg CMG\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"summary\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 824,\n \"samples\": [\n \"This dataset offers global predictions of sand content (%) at different soil depths (0-200cm) with a 250m resolution. The predictions are generated using machine learning models trained on a comprehensive soil profile database. \\n\\n\\n\\\"b0\\\" represents Sand content at 0 cm depth\\n\\\"b10\\\" represents Sand content at 10 cm depth\\n\\\"b30\\\" represents Sand content at 30 cm depth\\n\\\"b60\\\" represents Sand content at 60 cm depth\\n\\\"b100\\\" represents Sand content at 100 cm depth\\n\\\"b200\\\" represents Sand content at 200 cm depth\\n\",\n \"This product provides global Leaf Area Index (LAI) measurements, indicating green leaf density, derived from the GCOM-C satellite's SGLI instrument. This data is crucial for understanding the Earth's climate system and improving climate models. A newer, more accurate version (V3) is available. \\n\\n\\n\\\"LAI_AVE\\\" represents The sum of the one-sided green leaf area per unit ground area.\\n\\\"LAI_QA_flag\\\" represents LAI QA\\n\",\n \"The Physiographic Diversity dataset, using elevation data and the Shannon diversity index, identifies areas with diverse landforms. This information helps climate adaptation planning by highlighting regions with stable physiographic features likely to persist despite uncertain climate changes. 
High-resolution data ensures maximum usefulness for local planning. \\n\\n\\n\\\"b1\\\" represents Physiographic diversity\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embedding\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "embeddings_df" + }, + "text/html": [ + "\n", + " \u003cdiv id=\"df-e395cf12-1b3c-4f9e-85dc-46a8cfb3987c\" class=\"colab-df-container\"\u003e\n", + " \u003cdiv\u003e\n", + "\u003cstyle scoped\u003e\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "\u003c/style\u003e\n", + "\u003ctable border=\"1\" class=\"dataframe\"\u003e\n", + " \u003cthead\u003e\n", + " \u003ctr style=\"text-align: right;\"\u003e\n", + " \u003cth\u003e\u003c/th\u003e\n", + " \u003cth\u003eid\u003c/th\u003e\n", + " \u003cth\u003ename\u003c/th\u003e\n", + " \u003cth\u003esummary\u003c/th\u003e\n", + " \u003cth\u003eembedding\u003c/th\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/thead\u003e\n", + " \u003ctbody\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e0\u003c/th\u003e\n", + " \u003ctd\u003eAAFC/ACI\u003c/td\u003e\n", + " \u003ctd\u003eCanada AAFC Annual Crop Inventory\u003c/td\u003e\n", + " \u003ctd\u003eAgriculture and Agri-Food Canada annually maps...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.0297800228, 0.017559804000000002, -0.02272...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e1\u003c/th\u003e\n", + " \u003ctd\u003eACA/reef_habitat/v2_0\u003c/td\u003e\n", + " \u003ctd\u003eAllen Coral Atlas (ACA) - Geomorphic Zonation ...\u003c/td\u003e\n", + " \u003ctd\u003eThe Allen Coral Atlas is a global, high-resolu...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.00014006280000000002, 
0.0595749207, -0.044...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e2\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_INT\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM is a high-resolution (0.5m) model ...\u003c/td\u003e\n", + " \u003ctd\u003e[0.0002864186, -0.0866151974, -0.0573001616, 0...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e3\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_NON\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Non-Interpolated\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM, created from 2007-2012 LIDAR data...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.015314440200000001, -0.07613136620000001, ...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e4\u003c/th\u003e\n", + " \u003ctd\u003eAHN/AHN2_05M_RUW\u003c/td\u003e\n", + " \u003ctd\u003eAHN Netherlands 0.5m DEM, Raw Samples\u003c/td\u003e\n", + " \u003ctd\u003eThe AHN DEM, a high-resolution (0.5m) elevatio...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.0136169484, -0.1123650596, -0.0680219904, ...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e...\u003c/th\u003e\n", + " \u003ctd\u003e...\u003c/td\u003e\n", + " \u003ctd\u003e...\u003c/td\u003e\n", + " \u003ctd\u003e...\u003c/td\u003e\n", + " \u003ctd\u003e...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e819\u003c/th\u003e\n", + " \u003ctd\u003eprojects/planet-nicfi/assets/basemaps/americas\u003c/td\u003e\n", + " \u003ctd\u003eNICFI Satellite Data Program Basemaps for Trop...\u003c/td\u003e\n", + " \u003ctd\u003e**Concise Summary:**\\n\\nThe Norway's Internati...\u003c/td\u003e\n", + " \u003ctd\u003e[-0.0014470087, 0.028179975200000002, -0.02454...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e820\u003c/th\u003e\n", + " 
\u003ctd\u003eprojects/planet-nicfi/assets/basemaps/asia\u003c/td\u003e\n", + " \u003ctd\u003eNICFI Satellite Data Program Basemaps for Trop...\u003c/td\u003e\n", + " \u003ctd\u003eThis collection provides high-resolution satel...\u003c/td\u003e\n", + " \u003ctd\u003e[0.0229713377, 0.0144163994, 0.019438674700000...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e821\u003c/th\u003e\n", + " \u003ctd\u003eprojects/sat-io/open-datasets/GLOBathy/GLOBath...\u003c/td\u003e\n", + " \u003ctd\u003eGLOBathy Global lakes bathymetry dataset\u003c/td\u003e\n", + " \u003ctd\u003eGLOBathy offers detailed depth maps of over 1....\u003c/td\u003e\n", + " \u003ctd\u003e[0.009630191100000001, 0.049831219, -0.0843064...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e822\u003c/th\u003e\n", + " \u003ctd\u003eprojects/sat-io/open-datasets/ORNL/LANDSCAN_GL...\u003c/td\u003e\n", + " \u003ctd\u003eLandScan Population Data Global 1km\u003c/td\u003e\n", + " \u003ctd\u003eLandScan, a high-resolution global population ...\u003c/td\u003e\n", + " \u003ctd\u003e[0.0689298883, 0.0050361394, -0.0488965176, -0...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003ctr\u003e\n", + " \u003cth\u003e823\u003c/th\u003e\n", + " \u003ctd\u003eprojects/sat-io/open-datasets/us-drought-monitor\u003c/td\u003e\n", + " \u003ctd\u003eUnited States Drought Monitor\u003c/td\u003e\n", + " \u003ctd\u003eThe U.S. 
Drought Monitor is a weekly map that ...\u003c/td\u003e\n", + " \u003ctd\u003e[0.0587443635, 0.015232119700000001, -0.032561...\u003c/td\u003e\n", + " \u003c/tr\u003e\n", + " \u003c/tbody\u003e\n", + "\u003c/table\u003e\n", + "\u003cp\u003e824 rows × 4 columns\u003c/p\u003e\n", + "\u003c/div\u003e\n", + " \u003cdiv class=\"colab-df-buttons\"\u003e\n", + "\n", + " \u003cdiv class=\"colab-df-container\"\u003e\n", + " \u003cbutton class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e395cf12-1b3c-4f9e-85dc-46a8cfb3987c')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + " \u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\"\u003e\n", + " \u003cpath d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/\u003e\n", + " \u003c/svg\u003e\n", + " \u003c/button\u003e\n", + "\n", + " \u003cstyle\u003e\n", + " .colab-df-container {\n", + " display:flex;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " .colab-df-buttons div {\n", + " margin-bottom: 4px;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: 
drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " \u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " const buttonEl =\n", + " document.querySelector('#df-e395cf12-1b3c-4f9e-85dc-46a8cfb3987c button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-e395cf12-1b3c-4f9e-85dc-46a8cfb3987c');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? Visit the ' +\n", + " '\u003ca target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb\u003edata table notebook\u003c/a\u003e'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " \u003c/script\u003e\n", + " \u003c/div\u003e\n", + "\n", + "\n", + "\u003cdiv id=\"df-8b687265-48f5-4f17-8f32-c66df95d7cd7\"\u003e\n", + " \u003cbutton class=\"colab-df-quickchart\" onclick=\"quickchart('df-8b687265-48f5-4f17-8f32-c66df95d7cd7')\"\n", + " title=\"Suggest charts\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + "\u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\"\u003e\n", + " \u003cg\u003e\n", + " \u003cpath d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/\u003e\n", + " \u003c/g\u003e\n", + "\u003c/svg\u003e\n", + " \u003c/button\u003e\n", + "\n", + "\u003cstyle\u003e\n", + " .colab-df-quickchart {\n", + " 
--bg-color: #E8F0FE;\n", + " --fill-color: #1967D2;\n", + " --hover-bg-color: #E2EBFA;\n", + " --hover-fill-color: #174EA6;\n", + " --disabled-fill-color: #AAA;\n", + " --disabled-bg-color: #DDD;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-quickchart {\n", + " --bg-color: #3B4455;\n", + " --fill-color: #D2E3FC;\n", + " --hover-bg-color: #434B5C;\n", + " --hover-fill-color: #FFFFFF;\n", + " --disabled-bg-color: #3B4455;\n", + " --disabled-fill-color: #666;\n", + " }\n", + "\n", + " .colab-df-quickchart {\n", + " background-color: var(--bg-color);\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: var(--fill-color);\n", + " height: 32px;\n", + " padding: 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-quickchart:hover {\n", + " background-color: var(--hover-bg-color);\n", + " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: var(--button-hover-fill-color);\n", + " }\n", + "\n", + " .colab-df-quickchart-complete:disabled,\n", + " .colab-df-quickchart-complete:disabled:hover {\n", + " background-color: var(--disabled-bg-color);\n", + " fill: var(--disabled-fill-color);\n", + " box-shadow: none;\n", + " }\n", + "\n", + " .colab-df-spinner {\n", + " border: 2px solid var(--fill-color);\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " animation:\n", + " spin 1s steps(1) infinite;\n", + " }\n", + "\n", + " @keyframes spin {\n", + " 0% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " border-left-color: var(--fill-color);\n", + " }\n", + " 20% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 30% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " border-right-color: var(--fill-color);\n", + 
" }\n", + " 40% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 60% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 80% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " 90% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " }\n", + "\u003c/style\u003e\n", + "\n", + " \u003cscript\u003e\n", + " async function quickchart(key) {\n", + " const quickchartButtonEl =\n", + " document.querySelector('#' + key + ' button');\n", + " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", + " quickchartButtonEl.classList.add('colab-df-spinner');\n", + " try {\n", + " const charts = await google.colab.kernel.invokeFunction(\n", + " 'suggestCharts', [key], {});\n", + " } catch (error) {\n", + " console.error('Error during call to suggestCharts:', error);\n", + " }\n", + " quickchartButtonEl.classList.remove('colab-df-spinner');\n", + " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", + " }\n", + " (() =\u003e {\n", + " let quickchartButtonEl =\n", + " document.querySelector('#df-8b687265-48f5-4f17-8f32-c66df95d7cd7 button');\n", + " quickchartButtonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 
'block' : 'none';\n", + " })();\n", + " \u003c/script\u003e\n", + "\u003c/div\u003e\n", + "\n", + " \u003cdiv id=\"id_06f8b20d-a0df-4505-8b1b-c1f8470fdcec\"\u003e\n", + " \u003cstyle\u003e\n", + " .colab-df-generate {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-generate:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-generate {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-generate:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " \u003c/style\u003e\n", + " \u003cbutton class=\"colab-df-generate\" onclick=\"generateWithVariable('embeddings_df')\"\n", + " title=\"Generate code using this dataframe.\"\n", + " style=\"display:none;\"\u003e\n", + "\n", + " \u003csvg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\"\u003e\n", + " \u003cpath d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/\u003e\n", + " \u003c/svg\u003e\n", + " \u003c/button\u003e\n", + " \u003cscript\u003e\n", + " (() =\u003e {\n", + " const buttonEl =\n", + " document.querySelector('#id_06f8b20d-a0df-4505-8b1b-c1f8470fdcec button.colab-df-generate');\n", + " buttonEl.style.display =\n", + " 
google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " buttonEl.onclick = () =\u003e {\n", + " google.colab.notebook.generateWithVariable('embeddings_df');\n", + " }\n", + " })();\n", + " \u003c/script\u003e\n", + " \u003c/div\u003e\n", + "\n", + " \u003c/div\u003e\n", + " \u003c/div\u003e\n" + ], + "text/plain": [ + " id \\\n", + "0 AAFC/ACI \n", + "1 ACA/reef_habitat/v2_0 \n", + "2 AHN/AHN2_05M_INT \n", + "3 AHN/AHN2_05M_NON \n", + "4 AHN/AHN2_05M_RUW \n", + ".. ... \n", + "819 projects/planet-nicfi/assets/basemaps/americas \n", + "820 projects/planet-nicfi/assets/basemaps/asia \n", + "821 projects/sat-io/open-datasets/GLOBathy/GLOBath... \n", + "822 projects/sat-io/open-datasets/ORNL/LANDSCAN_GL... \n", + "823 projects/sat-io/open-datasets/us-drought-monitor \n", + "\n", + " name \\\n", + "0 Canada AAFC Annual Crop Inventory \n", + "1 Allen Coral Atlas (ACA) - Geomorphic Zonation ... \n", + "2 AHN Netherlands 0.5m DEM, Interpolated \n", + "3 AHN Netherlands 0.5m DEM, Non-Interpolated \n", + "4 AHN Netherlands 0.5m DEM, Raw Samples \n", + ".. ... \n", + "819 NICFI Satellite Data Program Basemaps for Trop... \n", + "820 NICFI Satellite Data Program Basemaps for Trop... \n", + "821 GLOBathy Global lakes bathymetry dataset \n", + "822 LandScan Population Data Global 1km \n", + "823 United States Drought Monitor \n", + "\n", + " summary \\\n", + "0 Agriculture and Agri-Food Canada annually maps... \n", + "1 The Allen Coral Atlas is a global, high-resolu... \n", + "2 The AHN DEM is a high-resolution (0.5m) model ... \n", + "3 The AHN DEM, created from 2007-2012 LIDAR data... \n", + "4 The AHN DEM, a high-resolution (0.5m) elevatio... \n", + ".. ... \n", + "819 **Concise Summary:**\\n\\nThe Norway's Internati... \n", + "820 This collection provides high-resolution satel... \n", + "821 GLOBathy offers detailed depth maps of over 1.... \n", + "822 LandScan, a high-resolution global population ... \n", + "823 The U.S. 
Drought Monitor is a weekly map that ... \n", + "\n", + " embedding \n", + "0 [-0.0297800228, 0.017559804000000002, -0.02272... \n", + "1 [-0.00014006280000000002, 0.0595749207, -0.044... \n", + "2 [0.0002864186, -0.0866151974, -0.0573001616, 0... \n", + "3 [-0.015314440200000001, -0.07613136620000001, ... \n", + "4 [-0.0136169484, -0.1123650596, -0.0680219904, ... \n", + ".. ... \n", + "819 [-0.0014470087, 0.028179975200000002, -0.02454... \n", + "820 [0.0229713377, 0.0144163994, 0.019438674700000... \n", + "821 [0.009630191100000001, 0.049831219, -0.0843064... \n", + "822 [0.0689298883, 0.0050361394, -0.0488965176, -0... \n", + "823 [0.0587443635, 0.015232119700000001, -0.032561... \n", + "\n", + "[824 rows x 4 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# @title Make sure we can load the new file from GCS\n", + "\n", + "EMBEDDINGS_CLOUD_PATH = 'gs://science-ai-ee-catalog-index/catalog_embeddings.jsonl'\n", + "# Download to a separate file so the locally generated copy at\n", + "# EMBEDDINGS_LOCAL_PATH cannot be clobbered by a failed or mismatched download.\n", + "EMBEDDINGS_DOWNLOAD_PATH = 'catalog_embeddings_from_gcs.jsonl'\n", + "\n", + "# 'gs://<bucket>/<object path>' -> ('<bucket>', '<object path>')\n", + "bucket_name, blob_path = EMBEDDINGS_CLOUD_PATH[len('gs://'):].split('/', 1)\n", + "# Client.bucket() creates the handle without a bucket-metadata request, so\n", + "# object-level read access is sufficient (get_bucket() fetches the bucket first).\n", + "bucket = storage_client.bucket(bucket_name)\n", + "blob = bucket.blob(blob_path)\n", + "blob.download_to_filename(EMBEDDINGS_DOWNLOAD_PATH)\n", + "\n", + "embeddings_df = pd.read_json(EMBEDDINGS_DOWNLOAD_PATH, lines=True)\n", + "embeddings_df" + ] + } + ], + "metadata": { + "colab": { + "provenance": [ + { + "file_id": "1ipx2FMYc7C0F98UOC4lgmBusGQlw2Af3", + "timestamp": 1726777405061 + } + ], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}