From 0d4fe3bf3c2cc1ea8791cafdb5e329d9eed32938 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Wed, 11 Dec 2024 21:51:40 -0800
Subject: [PATCH 01/70] Add the `refscan` PyPI package (v0.1.22) as a
 dependency of the Runtime

---
 requirements/main.in  |  3 +++
 requirements/main.txt | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/requirements/main.in b/requirements/main.in
index 2af8c6e7..9d0e1481 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -37,6 +37,9 @@ python-jose[cryptography]
 # Reference: https://github.com/microbiomedata/nmdc-runtime/security/dependabot/8
 python-multipart>=0.0.18
 pyyaml
+# Note: We use `refscan` to get information about inter-document references from the schema and database.
+# Reference: https://pypi.org/project/refscan/
+refscan==0.1.22
 requests
 semver
 setuptools-scm
diff --git a/requirements/main.txt b/requirements/main.txt
index 882d9435..5f683fa5 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -21,8 +21,6 @@ anyio==4.7.0
     #   jupyter-server
     #   starlette
     #   watchfiles
-appnope==0.1.4
-    # via ipykernel
 argon2-cffi==23.1.0
     # via jupyter-server
 argon2-cffi-bindings==21.2.0
@@ -94,6 +92,7 @@ click==8.1.7
     #   linkml-runtime
     #   mkdocs
     #   prefixcommons
+    #   typer
     #   uvicorn
 colorama==0.4.6
     # via mkdocs-material
@@ -196,6 +195,8 @@ graphql-relay==3.2.0
     # via graphene
 graphviz==0.20.3
     # via linkml
+greenlet==3.1.1
+    # via sqlalchemy
 grpcio==1.68.1
     # via
     #   dagster
@@ -361,6 +362,7 @@ linkml-runtime==1.8.3
     #   linkml
     #   linkml-dataops
     #   nmdc-schema
+    #   refscan
 lxml==5.3.0
     # via -r requirements/main.in
 mako==1.3.7
@@ -563,6 +565,7 @@ pymongo==4.9.2
     #   -r requirements/main.in
     #   motor
     #   nmdc-schema
+    #   refscan
 pyparsing==3.2.0
     # via rdflib
 pyshex==0.8.1
@@ -649,6 +652,8 @@ referencing==0.35.1
    #   jsonschema
    #   jsonschema-specifications
    #   jupyter-events
+refscan==0.1.22
+    # via -r requirements/main.in
 regex==2024.11.6
     # via mkdocs-material
 requests==2.32.3
@@ -681,7 +686,10 @@ rfc3986-validator==0.1.1
 rfc3987==1.3.8
     # via jsonschema
 rich==13.9.4
-    # via dagster
+    # via
+    #   dagster
+    #   refscan
+    #   typer
 rpds-py==0.22.3
     # via
     #   jsonschema
@@ -702,6 +710,8 @@ send2trash==1.8.3
     # via jupyter-server
 setuptools-scm==8.1.0
     # via -r requirements/main.in
+shellingham==1.5.4
+    # via typer
 shexjsg==0.8.2
     # via
     #   pyshex
@@ -792,6 +802,8 @@ traitlets==5.14.3
     #   nbclient
     #   nbconvert
     #   nbformat
+typer==0.12.5
+    # via refscan
 types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
@@ -810,6 +822,7 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   rich
     #   sqlalchemy
+    #   typer
     #   uvicorn
 tzdata==2024.2
     # via pandas

From a8422aa87a54a0235a0ce6b51ad73c499a5fea0f Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Wed, 11 Dec 2024 23:12:53 -0800
Subject: [PATCH 02/70] Generate and cache a list of schema-allowed references
 upon app startup

---
 nmdc_runtime/api/main.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index 46518485..25516d74 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -1,6 +1,7 @@
 import os
 import re
 from contextlib import asynccontextmanager
+from functools import cache
 from importlib import import_module
 from importlib.metadata import version
 from typing import Annotated
@@ -13,12 +14,16 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
+from linkml_runtime.utils.schemaview import SchemaView
+from nmdc_schema.nmdc_data import get_nmdc_schema_definition
+from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
 from starlette.responses import RedirectResponse, HTMLResponse, FileResponse
 
 from nmdc_runtime.api.analytics import Analytics
 from nmdc_runtime.util import (
+    collection_name_to_class_names,
     ensure_unique_id_indexes,
     REPO_ROOT_DIR,
 )
@@ -354,11 +359,42 @@ def ensure_default_api_perms():
         db["_runtime.api.allow"].create_index("action")
 
 
+@cache  # memoizes the decorated function
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Instantiate a LinkML `SchemaView` bound to the NMDC Schema.
+    schema_view = SchemaView(get_nmdc_schema_definition())
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=schema_view,
+        collection_name_to_class_names=collection_name_to_class_names
+    )
+
+    return references
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    r"""
+    Prepares the application to receive requests.
+
+    From the [FastAPI documentation](https://fastapi.tiangolo.com/advanced/events/#lifespan-function):
+    > You can define logic (code) that should be executed before the application starts up. This means that
+    > this code will be executed once, before the application starts receiving requests.
+
+    Note: Based on my own observations, I think this function gets called when the first request starts coming in,
+    but not before that (i.e. not when the application is idle before any requests start coming in).
+    """
     ensure_initial_resources_on_boot()
     ensure_attribute_indexes()
     ensure_default_api_perms()
+    _ = get_allowed_references()  # note: future invocations will benefit from the function's memoized-ness
     yield

From 9bc8d1faee2409690fb423c0ce240fc5051aa3a4 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Wed, 11 Dec 2024 23:32:20 -0800
Subject: [PATCH 03/70] Get `SchemaView` via existing `util` function instead
 of independently

---
 nmdc_runtime/api/main.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index 25516d74..fa0beff7 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -14,8 +14,6 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
-from linkml_runtime.utils.schemaview import SchemaView
-from nmdc_schema.nmdc_data import get_nmdc_schema_definition
 from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
@@ -25,6 +23,7 @@
 from nmdc_runtime.util import (
     collection_name_to_class_names,
     ensure_unique_id_indexes,
+    nmdc_schema_view,
     REPO_ROOT_DIR,
 )
 from nmdc_runtime.api.core.auth import (
@@ -366,13 +365,10 @@ def get_allowed_references() -> ReferenceList:
     the NMDC Schema allows a schema-compliant MongoDB database to contain.
     """
 
-    # Instantiate a LinkML `SchemaView` bound to the NMDC Schema.
-    schema_view = SchemaView(get_nmdc_schema_definition())
-
     # Identify the inter-document references that the schema allows a database to contain.
     print("Identifying schema-allowed references.")
     references = identify_references(
-        schema_view=schema_view,
+        schema_view=nmdc_schema_view(),
         collection_name_to_class_names=collection_name_to_class_names
     )

From 7bf29363fca16e10fe8ab11f198d8cfd52d0453f Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 12 Dec 2024 00:47:27 -0800
Subject: [PATCH 04/70] Check referential integrity on `/metadata/json:validate`
 in real time

---
 nmdc_runtime/api/main.py |  21 +------
 nmdc_runtime/util.py     | 120 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 20 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index fa0beff7..c443ce9c 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -14,16 +14,14 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.docs import get_swagger_ui_html
 from fastapi.staticfiles import StaticFiles
-from refscan.lib.helpers import identify_references, ReferenceList
 from setuptools_scm import get_version
 from starlette import status
 from starlette.responses import RedirectResponse, HTMLResponse, FileResponse
 
 from nmdc_runtime.api.analytics import Analytics
 from nmdc_runtime.util import (
-    collection_name_to_class_names,
+    get_allowed_references,
     ensure_unique_id_indexes,
-    nmdc_schema_view,
     REPO_ROOT_DIR,
 )
 from nmdc_runtime.api.core.auth import (
@@ -358,23 +356,6 @@ def ensure_default_api_perms():
     db["_runtime.api.allow"].create_index("action")
 
 
-@cache  # memoizes the decorated function
-def get_allowed_references() -> ReferenceList:
-    r"""
-    Returns a `ReferenceList` of all the inter-document references that
-    the NMDC Schema allows a schema-compliant MongoDB database to contain.
-    """
-
-    # Identify the inter-document references that the schema allows a database to contain.
-    print("Identifying schema-allowed references.")
-    references = identify_references(
-        schema_view=nmdc_schema_view(),
-        collection_name_to_class_names=collection_name_to_class_names
-    )
-
-    return references
-
-
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     r"""
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index eee078f9..05a1007a 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -22,6 +22,13 @@
 from pydantic import Field, BaseModel
 from pymongo.database import Database as MongoDatabase
 from pymongo.errors import OperationFailure
+from refscan.lib.helpers import (
+    derive_schema_class_name_from_document,
+    identify_references,
+)
+from refscan.lib.Finder import Finder
+from refscan.lib.ReferenceList import ReferenceList
+from refscan.lib.Violation import Violation
 from toolz import merge, unique
 
 from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -76,6 +83,23 @@ def get_class_names_from_collection_spec(
     return class_names
 
 
+@lru_cache
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=nmdc_schema_view(),
+        collection_name_to_class_names=collection_name_to_class_names
+    )
+
+    return references
+
+
 @lru_cache
 def get_type_collections() -> dict:
     """Returns a dictionary mapping class names to Mongo collection names."""
@@ -497,6 +521,13 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._bottom_db.client.drop_database(self._top_db.name)
 
+    def get_collection(self, coll_name: str):
+        r"""Returns a reference to the specified collection."""
+        try:
+            return self._top_db[coll_name]
+        except OperationFailure as e:
+            raise OverlayDBError(str(e.details))
+
     def replace_or_insert_many(self, coll_name, documents: list):
         try:
             self._top_db[coll_name].insert_many(documents)
@@ -548,6 +579,22 @@ def merge_find(self, coll_name, find_spec: dict):
 
 
 def validate_json(in_docs: dict, mdb: MongoDatabase):
+    r"""
+    Checks whether the specified dictionary represents a valid instance of the `Database` class
+    defined in the NMDC Schema.
+
+    Example dictionary:
+    {
+        "biosample_set": [
+            {"id": "nmdc:bsm-00-000001", ...},
+            {"id": "nmdc:bsm-00-000002", ...}
+        ],
+        "study_set": [
+            {"id": "nmdc:sty-00-000001", ...},
+            {"id": "nmdc:sty-00-000002", ...}
+        ]
+    }
+    """
     validator = Draft7Validator(get_nmdc_jsonschema_dict())
     docs = deepcopy(in_docs)
     validation_errors = {}
@@ -576,6 +623,79 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
             try:
                 with OverlayDB(mdb) as odb:
                     odb.replace_or_insert_many(coll_name, coll_docs)
+
+                    # Check the referential integrity of the replaced or inserted documents.
+                    #
+                    # Note: If documents being inserted into the _current_ collection
+                    #       refer to documents being inserted into a _different_ collection
+                    #       as part of the same `in_docs` argument, this check will _not_
+                    #       find the latter documents.
+                    #
+                    # TODO: Enhance this referential integrity validation to account for the
+                    #       total of all operations; not just a single collection's operations.
+                    #
+                    # Note: Much of this code was copy/pasted from refscan, at:
+                    #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
+                    #
+                    source_collection_name = coll_name  # creates an alias to accommodate the copy/pasted code
+                    finder = Finder(database=odb)  # uses a generic name to accommodate the copy/pasted code
+                    references = get_allowed_references()  # uses a generic name to accommodate the copy/pasted code
+                    reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name()
+                    for document in coll_docs:
+
+                        # Get the document's schema class name so that we can interpret its fields accordingly.
+                        source_class_name = derive_schema_class_name_from_document(
+                            schema_view=nmdc_schema_view(),
+                            document=document,
+                        )
+
+                        # Get the names of that class's fields that can contain references.
+                        # Get the names of that class's fields that can contain references.
+                        names_of_reference_fields = reference_field_names_by_source_class_name.get(source_class_name, [])
+
+                        # Check each field that both (a) exists in the document and (b) can contain a reference.
+                        for field_name in names_of_reference_fields:
+                            if field_name in document:
+
+                                # Determine which collections can contain the referenced document, based upon
+                                # the schema class of which this source document is an instance.
+                                target_collection_names = references.get_target_collection_names(
+                                    source_class_name=source_class_name,
+                                    source_field_name=field_name,
+                                )
+
+                                # Handle both the multi-value (array) and the single-value (scalar) case,
+                                # normalizing the value or values into a list of values in either case.
+                                if type(document[field_name]) is list:
+                                    target_ids = document[field_name]
+                                else:
+                                    target_id = document[field_name]
+                                    target_ids = [target_id]  # makes a one-item list
+
+                                for target_id in target_ids:
+                                    name_of_collection_containing_target_document = (
+                                        finder.check_whether_document_having_id_exists_among_collections(
+                                            collection_names=target_collection_names, document_id=target_id
+                                        )
+                                    )
+                                    if name_of_collection_containing_target_document is None:
+                                        violation = Violation(
+                                            source_collection_name=source_collection_name,
+                                            source_field_name=field_name,
+                                            source_document_object_id=document.get("_id"),
+                                            source_document_id=document.get("id"),
+                                            target_id=target_id,
+                                            name_of_collection_containing_target=None,
+                                        )
+                                        violation_as_str = (f"Document '{violation.source_document_id}' "
+                                                            f"in collection '{violation.source_collection_name}' "
+                                                            f"has a field '{violation.source_field_name}' that "
+                                                            f"references a document having id "
+                                                            f"'{violation.target_id}', but the latter document "
+                                                            f"does not exist in any of the collections the "
+                                                            f"NMDC Schema says it can exist in.")
+                                        raise OverlayDBError(violation_as_str)
+
             except OverlayDBError as e:
                 validation_errors[coll_name].append(str(e))

From d8f69d1db9bdca7df6a87ef06e07cbd45c725acf Mon Sep 17 00:00:00 2001
From: github-actions
Date: Thu, 12 Dec 2024 08:49:09 +0000
Subject: [PATCH 05/70] style: reformat

---
 nmdc_runtime/api/main.py |  4 ++-
 nmdc_runtime/util.py     | 64 ++++++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py
index c443ce9c..6498d4cb 100644
--- a/nmdc_runtime/api/main.py
+++ b/nmdc_runtime/api/main.py
@@ -371,7 +371,9 @@ async def lifespan(app: FastAPI):
     ensure_initial_resources_on_boot()
     ensure_attribute_indexes()
     ensure_default_api_perms()
-    _ = get_allowed_references()  # note: future invocations will benefit from the function's memoized-ness
+    _ = (
+        get_allowed_references()
+    )  # note: future invocations will benefit from the function's memoized-ness
     yield
diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 05a1007a..8e72689d 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -94,7 +94,7 @@ def get_allowed_references() -> ReferenceList:
     print("Identifying schema-allowed references.")
     references = identify_references(
         schema_view=nmdc_schema_view(),
-        collection_name_to_class_names=collection_name_to_class_names
+        collection_name_to_class_names=collection_name_to_class_names,
     )
 
     return references
@@ -638,9 +638,15 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                     #       https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349
                    #
                    source_collection_name = coll_name  # creates an alias to accommodate the copy/pasted code
-                    finder = Finder(database=odb)  # uses a generic name to accommodate the copy/pasted code
-                    references = get_allowed_references()  # uses a generic name to accommodate the copy/pasted code
-                    reference_field_names_by_source_class_name = references.get_reference_field_names_by_source_class_name()
+                    finder = Finder(
+                        database=odb
+                    )  # uses a generic name to accommodate the copy/pasted code
+                    references = (
+                        get_allowed_references()
+                    )  # uses a generic name to accommodate the copy/pasted code
+                    reference_field_names_by_source_class_name = (
+                        references.get_reference_field_names_by_source_class_name()
+                    )
                     for document in coll_docs:
 
                         # Get the document's schema class name so that we can interpret its fields accordingly.
                         source_class_name = derive_schema_class_name_from_document(
                             schema_view=nmdc_schema_view(),
                             document=document,
                         )
 
                         # Get the names of that class's fields that can contain references.
                         # Get the names of that class's fields that can contain references.
                         names_of_reference_fields = (
                             reference_field_names_by_source_class_name.get(
                                 source_class_name, []
                             )
                         )
 
                         # Check each field that both (a) exists in the document and (b) can contain a reference.
                         for field_name in names_of_reference_fields:
                             if field_name in document:
 
                                 # Determine which collections can contain the referenced document, based upon
                                 # the schema class of which this source document is an instance.
                                 target_collection_names = (
                                     references.get_target_collection_names(
                                         source_class_name=source_class_name,
                                         source_field_name=field_name,
                                     )
                                 )
 
                                 # Handle both the multi-value (array) and the single-value (scalar) case,
                                 # normalizing the value or values into a list of values in either case.
                                 if type(document[field_name]) is list:
                                     target_ids = document[field_name]
                                 else:
                                     target_id = document[field_name]
                                     target_ids = [
                                         target_id
                                     ]  # makes a one-item list
 
                                 for target_id in target_ids:
                                     name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections(
                                         collection_names=target_collection_names,
                                         document_id=target_id,
                                     )
                                     if (
                                         name_of_collection_containing_target_document
                                         is None
                                     ):
                                         violation = Violation(
                                             source_collection_name=source_collection_name,
                                             source_field_name=field_name,
                                             source_document_object_id=document.get(
                                                 "_id"
                                             ),
                                             source_document_id=document.get("id"),
                                             target_id=target_id,
                                             name_of_collection_containing_target=None,
                                         )
                                         violation_as_str = (
                                             f"Document '{violation.source_document_id}' "
                                             f"in collection '{violation.source_collection_name}' "
                                             f"has a field '{violation.source_field_name}' that "
                                             f"references a document having id "
                                             f"'{violation.target_id}', but the latter document "
                                             f"does not exist in any of the collections the "
                                             f"NMDC Schema says it can exist in."
                                         )
                                         raise OverlayDBError(violation_as_str)
             except OverlayDBError as e:
                 validation_errors[coll_name].append(str(e))

From 82ce38e0cb43feb5548fe2cc74fa9e0c1ee4ee70 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 12 Dec 2024 10:56:44 -0800
Subject: [PATCH 06/70] Remove redundant comment, replicate `refscan` code, and
 run `black`

---
 nmdc_runtime/util.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 8e72689d..76e22bbf 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -648,14 +648,12 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                         references.get_reference_field_names_by_source_class_name()
                     )
                     for document in coll_docs:
-
                         # Get the document's schema class name so that we can interpret its fields accordingly.
                         source_class_name = derive_schema_class_name_from_document(
                             schema_view=nmdc_schema_view(),
                             document=document,
                         )
 
-                        # Get the names of that class's fields that can contain references.
                         # Get the names of that class's fields that can contain references.
                         names_of_reference_fields = (
                             reference_field_names_by_source_class_name.get(
                                 source_class_name, []
                             )
                         )
@@ -666,7 +664,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                         # Check each field that both (a) exists in the document and (b) can contain a reference.
                         for field_name in names_of_reference_fields:
                             if field_name in document:
-
                                 # Determine which collections can contain the referenced document, based upon
                                 # the schema class of which this source document is an instance.
                                 target_collection_names = (
@@ -703,7 +700,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
                                             ),
                                             source_document_id=document.get("id"),
                                             target_id=target_id,
-                                            name_of_collection_containing_target=None,
+                                            name_of_collection_containing_target=name_of_collection_containing_target_document,
                                         )

From ae83dc9e7162756dea4c572ac7d2a9daaa176a17 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 12 Dec 2024 11:39:20 -0800
Subject: [PATCH 07/70] Implement basic automated tests targeting the
 `validate_json` function

---
 tests/test_util/README.md    |  5 ++
 tests/test_util/__init__.py  |  0
 tests/test_util/test_util.py | 99 ++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 tests/test_util/README.md
 create mode 100644 tests/test_util/__init__.py
 create mode 100644 tests/test_util/test_util.py

diff --git a/tests/test_util/README.md b/tests/test_util/README.md
new file mode 100644
index 00000000..c3746163
--- /dev/null
+++ b/tests/test_util/README.md
@@ -0,0 +1,5 @@
+This directory contains file related to testing code written in the `nmdc_runtime/util.py` file.
+
+I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
+In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file
+`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility").
diff --git a/tests/test_util/__init__.py b/tests/test_util/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py
new file mode 100644
index 00000000..4419bbe0
--- /dev/null
+++ b/tests/test_util/test_util.py
@@ -0,0 +1,99 @@
+from nmdc_runtime.api.db.mongo import get_mongo_db
+from nmdc_runtime.util import validate_json
+
+# Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
+#      by issuing the following command from the root directory of the repository:
+#      ```
+#      $ pytest tests/test_util/test_util.py
+#      ```
+
+
+def test_validate_json():
+    # Get a reference to the MongoDB database, since the `validate_json` function requires
+    # it to be passed in as a parameter.
+    mdb = get_mongo_db()
+
+    # Define a reusable dictionary that matches the value the `validate_json` function
+    # returns when it considers the input to be valid.
+    ok_result = {"result": "All Okay!"}
+
+    # Test: An empty outer dictionary is valid.
+    database_dict = {}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: An empty collection is valid.
+    database_dict = {"study_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: Two empty collections is valid.
+    database_dict = {"biosample_set": [], "study_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: A schema-compliant document is valid.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            }
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: Multiple schema-compliant documents are valid.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+            {
+                "id": "nmdc:sty-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result == ok_result
+
+    # Test: The function reports an error for the schema-defiant document.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:OTHER-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 1
+
+    # Test: The function reports an error for each schema-defiant document.
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:OTHER-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+            {
+                "id": "nmdc:OTHER-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 2

From 7358d8b309e23bfa2eb0c86f51e5edf76e70a9b2 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 12 Dec 2024 11:42:14 -0800
Subject: [PATCH 08/70] Add test demonstrating function behavior with invalid
 collection name

---
 tests/test_util/test_util.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_util/test_util.py b/tests/test_util/test_util.py
index 4419bbe0..6653a1d2 100644
--- a/tests/test_util/test_util.py
+++ b/tests/test_util/test_util.py
@@ -27,6 +27,13 @@ def test_validate_json():
     result = validate_json(in_docs=database_dict, mdb=mdb)
     assert result == ok_result
 
+    # Test: The function reports an error for a schema-defiant collection name.
+    database_dict = {"OTHER_set": []}
+    result = validate_json(in_docs=database_dict, mdb=mdb)
+    assert result["result"] == "errors"
+    assert "OTHER_set" in result["detail"]
+    assert len(result["detail"]["OTHER_set"]) == 1
+
     # Test: Two empty collections is valid.
     database_dict = {"biosample_set": [], "study_set": []}
     result = validate_json(in_docs=database_dict, mdb=mdb)

From 52b0865d17673844c8b5fa5f52c230feebcd0880 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 12 Dec 2024 12:12:07 -0800
Subject: [PATCH 09/70] Rename directory to work around naming conflict
 reported by pytest

---
 tests/test_the_util/README.md                 | 25 +++++++++++++++++++
 .../{test_util => test_the_util}/__init__.py  |  0
 .../test_the_util.py}                         |  2 +-
 tests/test_util/README.md                     |  5 ----
 4 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_the_util/README.md
 rename tests/{test_util => test_the_util}/__init__.py (100%)
 rename tests/{test_util/test_util.py => test_the_util/test_the_util.py} (98%)
 delete mode 100644 tests/test_util/README.md

diff --git a/tests/test_the_util/README.md b/tests/test_the_util/README.md
new file mode 100644
index 00000000..2c8c24c5
--- /dev/null
+++ b/tests/test_the_util/README.md
@@ -0,0 +1,25 @@
+This directory contains files related to testing code written in the `nmdc_runtime/util.py` file.
+
+### Why this directory is not named `test_util`
+
+I named the directory "`test_the_util`" to work around a limitation of pytest in the context of this repository.
+
+I tried naming the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
+In its name, "`test`" was a verb and "`util`" was a noun (i.e. "to test the utility"). This was in contrast to the file
+`../test_util.py` (a file that was already in the repository), in whose name "`test`" is an adjective and "`util`" is a
+noun (i.e. "a test-related utility"). However, with those names in place, `pytest` reported this error:
+
+```py
+_____________________ ERROR collecting tests/test_util.py ______________________
+import file mismatch:
+imported module 'tests.test_util' has this __file__ attribute:
+  /code/tests/test_util
+which is not the same as the test file we want to collect:
+  /code/tests/test_util.py
+HINT: remove __pycache__ / .pyc files and/or use a unique basename for your test file modules
+```
+
+To work around that, I renamed the directory to `test_the_util` and renamed the contained Python file to match.
+
+That is why the name of this test directory does not follow the naming convention
+of the other test directories.
diff --git a/tests/test_util/__init__.py b/tests/test_the_util/__init__.py
similarity index 100%
rename from tests/test_util/__init__.py
rename to tests/test_the_util/__init__.py
diff --git a/tests/test_util/test_util.py b/tests/test_the_util/test_the_util.py
similarity index 98%
rename from tests/test_util/test_util.py
rename to tests/test_the_util/test_the_util.py
index 6653a1d2..ba09d2dd 100644
--- a/tests/test_util/test_util.py
+++ b/tests/test_the_util/test_the_util.py
@@ -4,7 +4,7 @@
 # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
 #      by issuing the following command from the root directory of the repository:
 #      ```
-#      $ pytest tests/test_util/test_util.py
+#      $ pytest tests/test_the_util/test_the_util.py
 #      ```
diff --git a/tests/test_util/README.md b/tests/test_util/README.md
deleted file mode 100644
index c3746163..00000000
--- a/tests/test_util/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-This directory contains file related to testing code written in the `nmdc_runtime/util.py` file.
-
-I named the directory "`test_util`" in an attempt to follow the naming convention of the other test directories.
-In its name, "`test`" is a verb and "`util`" is a noun (i.e. "to test the utility"). This is in contrast to the file
-`../test_util.py`, in whose name "`test`" is an adjective and "`util`" is a noun (i.e. "a test-related utility").
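A note on the caching pattern used in patches 02-04 above: `@cache`/`@lru_cache` memoize the decorated zero-argument function, so the schema walk in `get_allowed_references` runs once, the warm-up call in `lifespan` pays that cost at startup, and later calls are cheap lookups. The sketch below is illustrative only; `expensive_schema_walk` is a hypothetical stand-in for `get_allowed_references`, not part of the Runtime.

```python
# Minimal illustration of the memoization pattern from patches 02-04.
from functools import lru_cache


@lru_cache
def expensive_schema_walk() -> tuple:
    # Stand-in for get_allowed_references(); the print shows it runs only once.
    print("identifying references...")
    return ("reference-1", "reference-2")


expensive_schema_walk()  # first call: prints and computes the result
expensive_schema_walk()  # subsequent calls: return the memoized result silently
```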
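For context, here is a minimal standalone driver for `validate_json`, mirroring the tests added in patches 07-09. It assumes a reachable MongoDB configured the way the test suite configures it (via `get_mongo_db`); the payload shape is taken from those tests.

```python
# Hedged sketch: exercising validate_json the way tests/test_the_util/test_the_util.py does.
from nmdc_runtime.api.db.mongo import get_mongo_db
from nmdc_runtime.util import validate_json

mdb = get_mongo_db()
payload = {
    "study_set": [
        {
            "id": "nmdc:sty-00-000001",
            "type": "nmdc:Study",
            "study_category": "research_study",
        }
    ]
}
# A schema-compliant payload yields {"result": "All Okay!"}; a schema-defiant one
# yields {"result": "errors", "detail": {...}} keyed by collection name.
print(validate_json(in_docs=payload, mdb=mdb))
```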
From 725a6940160d6d735adc7ea2713457a1a42ceb70 Mon Sep 17 00:00:00 2001 From: aclum Date: Tue, 17 Dec 2024 14:20:41 -0800 Subject: [PATCH 10/70] Update changesheet-without-separator3.tsv --- .../notebooks/data/changesheet-without-separator3.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv index b362cc02..cdce2dbe 100644 --- a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv +++ b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv @@ -25,3 +25,4 @@ v4 applied_roles Investigation v4 applies_to_person.name NEW CURATOR NAME 1 v4 applies_to_person.type nmdc:PersonValue v4 type prov:Association +nmdc:sty-11-fkbnah04 insert funding_sources United States National Science Foundation Microbial Observatories program (MCB-0702395)| Long-Term Ecological Research Program (NTL–LTER DEB-1440297)|INSPIRE award (DEB-1344254) From ae0d96a6165c345c5abf18b11375a20e05e54e8d Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 17:43:03 -0800 Subject: [PATCH 11/70] Add test where specified Study has no Biosamples --- tests/test_api/test_endpoints.py | 47 ++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index acfc81bf..47868fd2 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -29,19 +29,23 @@ from nmdc_runtime.util import REPO_ROOT_DIR, ensure_unique_id_indexes -def ensure_schema_collections_and_alldocs(): - # Return if `alldocs` collection has already been materialized. +def ensure_schema_collections_and_alldocs(force_refresh_of_alldocs: bool = False): + r""" + This function can be used to ensure things (?) about schema-described collections and the "alldocs" collection. + + :param bool force_refresh_of_alldocs: Whether you want to force a refresh of the "alldocs" collection, + regardless of whether it is empty of not. By default, this function + will only refresh the "alldocs" collection if it is empty. + """ + + # Return if `alldocs` collection has already been materialized, and caller does not want to force a refresh of it. mdb = get_mongo_db() - if mdb.alldocs.estimated_document_count() > 0: + if mdb.alldocs.estimated_document_count() > 0 and not force_refresh_of_alldocs: print( "ensure_schema_collections_and_alldocs: `alldocs` collection already materialized" ) return - # FIXME: Seed the database with documents that would be included in an `alldocs` collection, - # such that the `/data_objects/study/{study_id}` endpoint (which uses that collection) - # would return some data. Currently, we are practically _not testing_ that endpoint. - ensure_unique_id_indexes(mdb) print("materializing alldocs...") materialize_alldocs( @@ -438,8 +442,6 @@ def test_find_data_objects_for_nonexistent_study(api_site_client): Note: The `api_site_client` fixture's `request` method will raise an exception if the server responds with an unsuccessful status code. - - TODO: Add tests focused on the situation where the `Study` _does_ exist. """ ensure_schema_collections_and_alldocs() with pytest.raises(requests.exceptions.HTTPError): @@ -449,6 +451,33 @@ def test_find_data_objects_for_nonexistent_study(api_site_client): ) +def test_find_data_objects_for_study_having_none(api_site_client): + # Seed the test database with a study having no associated data objects. 
+ mdb = get_mongo_db() + study_id = "nmdc:sty-00-beeeeeef" + study_dict = { + "id": study_id, + "type": "nmdc:Study", + "study_category": "research_study", + } + mdb.get_collection(name="study_set").replace_one( + {"id": study_id}, study_dict, upsert=True + ) + + # Update the `alldocs` collection, which is a cache used by the endpoint under test. + ensure_schema_collections_and_alldocs(force_refresh_of_alldocs=True) + + # Confirm the endpoint responds with no data objects. + response = api_site_client.request("GET", f"/data_objects/study/{study_id}") + assert response.status_code == 200 + data_objects_by_biosample = response.json() + assert len(data_objects_by_biosample) == 0 + + # Clean up: Delete the documents we created within this test, from the database. + mdb.get_collection(name="study_set").delete_one({"id": study_id}) + mdb.get_collection(name="alldocs").delete_many({}) + + def test_find_planned_processes(api_site_client): mdb = get_mongo_db() database_dict = json.loads( From dd17e34628b207d3d42619b7c649bf9e6e6af93e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 17:43:46 -0800 Subject: [PATCH 12/70] Document time-saving tip about pointing pytest at a specific module --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index a3509b69..f458db60 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,8 @@ test-dbinit: docker compose --file docker-compose.test.yml \ exec mongo /bin/bash -c "/mongorestore-nmdc-testdb.sh" +# Tip: If you append a file path to this "recipe", pytest will run only the tests defined in that file. +# For example, append `tests/test_api/test_endpoints.py` to have pytest only run the endpoint tests. test-run: docker compose --file docker-compose.test.yml run test From 1877587daf2958a3e8c5e22f8f03bbd04b568550 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 18:11:17 -0800 Subject: [PATCH 13/70] Add test where specified Study has 1 DataObject via 1 MassSpectrometry --- tests/test_api/test_endpoints.py | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index 47868fd2..b2ace4f2 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -478,6 +478,81 @@ def test_find_data_objects_for_study_having_none(api_site_client): mdb.get_collection(name="alldocs").delete_many({}) +def test_find_data_objects_for_study_having_one(api_site_client): + # Seed the test database with a study having one associated data object. 
+ mdb = get_mongo_db() + study_id = "nmdc:sty-00-studio" + study_dict = { + "id": study_id, + "type": "nmdc:Study", + "study_category": "research_study", + } + mdb.get_collection(name="study_set").replace_one( + {"id": study_id}, study_dict, upsert=True + ) + biosample_id = "nmdc:bsm-00-campione" + biosample_dict = { + "id": biosample_id, + "type": "nmdc:Biosample", + "associated_studies": [study_id], + "env_broad_scale": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue" + }, + "env_local_scale": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue" + }, + "env_medium": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue" + } + } + mdb.get_collection(name="biosample_set").replace_one( + {"id": biosample_id}, biosample_dict, upsert=True + ) + data_object_id = "nmdc:dobj-00-oggetto" + data_object_dict = { + "id": data_object_id, + "name": "Some name", + "description": "Some description", + "type": "nmdc:DataObject", + } + mdb.get_collection(name="data_object_set").replace_one( + {"id": data_object_id}, data_object_dict, upsert=True + ) + # Note: The `MassSpectrometry` class inherits from the (abstract) `DataGeneration` class. + # Reference: https://microbiomedata.github.io/nmdc-schema/MassSpectrometry/ + mass_spectrometry_id = "nmdc:dgms-00-spettro" + mass_spectrometry_dict = { + "id": mass_spectrometry_id, + "type": "nmdc:MassSpectrometry", + "analyte_category": "metaproteome", + "associated_studies": [study_id], + "has_input": [biosample_id], + "has_output": [data_object_id], + } + mdb.get_collection(name="data_generation_set").replace_one( + {"id": mass_spectrometry_id}, mass_spectrometry_dict, upsert=True + ) + + # Update the `alldocs` collection, which is a cache used by the endpoint under test. + ensure_schema_collections_and_alldocs(force_refresh_of_alldocs=True) + + # Confirm the endpoint responds with no data objects. + response = api_site_client.request("GET", f"/data_objects/study/{study_id}") + assert response.status_code == 200 + data_objects_by_biosample = response.json() + assert len(data_objects_by_biosample) == 1 + + # Clean up: Delete the documents we created within this test, from the database. 
+ mdb.get_collection(name="study_set").delete_one({"id": study_id}) + mdb.get_collection(name="biosample_set").delete_one({"id": biosample_id}) + mdb.get_collection(name="data_generation_set").delete_one({"id": mass_spectrometry_id}) + mdb.get_collection(name="data_object_set").delete_one({"id": data_object_id}) + mdb.get_collection(name="alldocs").delete_many({}) + + def test_find_planned_processes(api_site_client): mdb = get_mongo_db() database_dict = json.loads( From ebb037c37fdd9946861b16a630c8cc693b528754 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 18:25:54 -0800 Subject: [PATCH 14/70] Assert more details about the API response payload --- tests/test_api/test_endpoints.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index b2ace4f2..24114d6d 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -544,6 +544,9 @@ def test_find_data_objects_for_study_having_one(api_site_client): assert response.status_code == 200 data_objects_by_biosample = response.json() assert len(data_objects_by_biosample) == 1 + assert data_objects_by_biosample[0]["biosample_id"] == biosample_id + assert len(data_objects_by_biosample[0]["data_objects"]) == 1 + assert data_objects_by_biosample[0]["data_objects"][0]["id"] == data_object_id # Clean up: Delete the documents we created within this test, from the database. mdb.get_collection(name="study_set").delete_one({"id": study_id}) From ca82e4283ca5086459ba943853cda4cecfdef61e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 18:26:40 -0800 Subject: [PATCH 15/70] Fix inaccurate comment --- tests/test_api/test_endpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index 24114d6d..f68e2330 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -539,7 +539,7 @@ def test_find_data_objects_for_study_having_one(api_site_client): # Update the `alldocs` collection, which is a cache used by the endpoint under test. ensure_schema_collections_and_alldocs(force_refresh_of_alldocs=True) - # Confirm the endpoint responds with no data objects. + # Confirm the endpoint responds with the data object we inserted above. 
response = api_site_client.request("GET", f"/data_objects/study/{study_id}") assert response.status_code == 200 data_objects_by_biosample = response.json() From 9907a5b785fb244120389becb296016f6de2c21d Mon Sep 17 00:00:00 2001 From: eecavanna <134325062+eecavanna@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:58:18 -0800 Subject: [PATCH 16/70] Update changesheet to target `Study` that exists in test database --- .../notebooks/data/changesheet-without-separator3.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv index cdce2dbe..7e4f24cd 100644 --- a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv +++ b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv @@ -25,4 +25,4 @@ v4 applied_roles Investigation v4 applies_to_person.name NEW CURATOR NAME 1 v4 applies_to_person.type nmdc:PersonValue v4 type prov:Association -nmdc:sty-11-fkbnah04 insert funding_sources United States National Science Foundation Microbial Observatories program (MCB-0702395)| Long-Term Ecological Research Program (NTL–LTER DEB-1440297)|INSPIRE award (DEB-1344254) +nmdc:sty-11-pzmd0x14 insert funding_sources United States National Science Foundation Microbial Observatories program (MCB-0702395)| Long-Term Ecological Research Program (NTL–LTER DEB-1440297)|INSPIRE award (DEB-1344254) From 92ffe182d4fbc1536a0b8de392bd1f9469196cc1 Mon Sep 17 00:00:00 2001 From: eecavanna <134325062+eecavanna@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:58:49 -0800 Subject: [PATCH 17/70] Remove leading whitespace from list item in example changesheet --- .../notebooks/data/changesheet-without-separator3.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv index 7e4f24cd..2e0c1e4e 100644 --- a/metadata-translation/notebooks/data/changesheet-without-separator3.tsv +++ b/metadata-translation/notebooks/data/changesheet-without-separator3.tsv @@ -25,4 +25,4 @@ v4 applied_roles Investigation v4 applies_to_person.name NEW CURATOR NAME 1 v4 applies_to_person.type nmdc:PersonValue v4 type prov:Association -nmdc:sty-11-pzmd0x14 insert funding_sources United States National Science Foundation Microbial Observatories program (MCB-0702395)| Long-Term Ecological Research Program (NTL–LTER DEB-1440297)|INSPIRE award (DEB-1344254) +nmdc:sty-11-pzmd0x14 insert funding_sources United States National Science Foundation Microbial Observatories program (MCB-0702395)|Long-Term Ecological Research Program (NTL–LTER DEB-1440297)|INSPIRE award (DEB-1344254) From a7e73c99947cf194cc46dd83abfc0f102900be75 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 20:20:47 -0800 Subject: [PATCH 18/70] Check referential integrity after processing _all_ specified collections Previously, we were checking it after inserting documents into _each_ specified collection, which made it so we would not know whether a referenced document would have been inserted into a later collection. 
--- nmdc_runtime/util.py | 199 +++++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 91 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 76e22bbf..d242c31e 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase): +def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class defined in the NMDC Schema. @@ -594,6 +594,13 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): {"id": "nmdc:sty-00-000002", ...} ] } + + :param dict in_docs: The dictionary you want to validate + :param MongoDatabase mdb: A reference to a MongoDB database + :param bool check_references: Whether you want this function to check whether every document that is referenced + by any of the documents passed in would, indeed, exist in the database, if the + documents passed in were to be inserted into the database. In other words, set this + to `True` if you want this function to perform referential integrity checks. """ validator = Draft7Validator(get_nmdc_jsonschema_dict()) docs = deepcopy(in_docs) @@ -623,96 +630,6 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): try: with OverlayDB(mdb) as odb: odb.replace_or_insert_many(coll_name, coll_docs) - - # Check the referential integrity of the replaced or inserted documents. - # - # Note: If documents being inserted into the _current_ collection - # refer to documents being inserted into a _different_ collection - # as part of the same `in_docs` argument, this check will _not_ - # find the latter documents. - # - # TODO: Enhance this referential integrity validation to account for the - # total of all operations; not just a single collection's operations. - # - # Note: Much of this code was copy/pasted from refscan, at: - # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 - # - source_collection_name = coll_name # creates an alias to accommodate the copy/pasted code - finder = Finder( - database=odb - ) # uses a generic name to accommodate the copy/pasted code - references = ( - get_allowed_references() - ) # uses a generic name to accommodate the copy/pasted code - reference_field_names_by_source_class_name = ( - references.get_reference_field_names_by_source_class_name() - ) - for document in coll_docs: - # Get the document's schema class name so that we can interpret its fields accordingly. - source_class_name = derive_schema_class_name_from_document( - schema_view=nmdc_schema_view(), - document=document, - ) - - # Get the names of that class's fields that can contain references. - names_of_reference_fields = ( - reference_field_names_by_source_class_name.get( - source_class_name, [] - ) - ) - - # Check each field that both (a) exists in the document and (b) can contain a reference. - for field_name in names_of_reference_fields: - if field_name in document: - # Determine which collections can contain the referenced document, based upon - # the schema class of which this source document is an instance. 
- target_collection_names = ( - references.get_target_collection_names( - source_class_name=source_class_name, - source_field_name=field_name, - ) - ) - - # Handle both the multi-value (array) and the single-value (scalar) case, - # normalizing the value or values into a list of values in either case. - if type(document[field_name]) is list: - target_ids = document[field_name] - else: - target_id = document[field_name] - target_ids = [ - target_id - ] # makes a one-item list - - for target_id in target_ids: - name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( - collection_names=target_collection_names, - document_id=target_id, - ) - if ( - name_of_collection_containing_target_document - is None - ): - violation = Violation( - source_collection_name=source_collection_name, - source_field_name=field_name, - source_document_object_id=document.get( - "_id" - ), - source_document_id=document.get("id"), - target_id=target_id, - name_of_collection_containing_target=name_of_collection_containing_target_document, - ) - violation_as_str = ( - f"Document '{violation.source_document_id}' " - f"in collection '{violation.source_collection_name}' " - f"has a field '{violation.source_field_name}' that " - f"references a document having id " - f"'{violation.target_id}', but the latter document " - f"does not exist in any of the collections the " - f"NMDC Schema says it can exist in." - ) - raise OverlayDBError(violation_as_str) - except OverlayDBError as e: validation_errors[coll_name].append(str(e)) @@ -724,6 +641,106 @@ def validate_json(in_docs: dict, mdb: MongoDatabase): except Exception as e: return {"result": "errors", "detail": str(e)} + # Third pass (if enabled): Check inter-document references. + if check_references is True: + # Insert all documents specified for all collections specified, into the OverlayDB. + # + # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, + # instead, validate it after processing _each_ collection, we would get a false positive if a document + # inserted into an earlier-processed collection happened to reference a document slated for insertion + # into a later-processed collection. By waiting until all documents in all collections specified have + # been inserted, we avoid that scenario. + # + with OverlayDB(mdb) as overlay_db: + print(f"Inserting documents into the OverlayDB.") + for collection_name, documents_to_insert in docs.items(): + try: + overlay_db.replace_or_insert_many(collection_name, documents_to_insert) + except OverlayDBError as error: + validation_errors[collection_name].append(str(error)) + + # Now that the OverlayDB contains all the specified documents, we will check whether + # every document referenced by any of the inserted documents exists. + finder = Finder(database=overlay_db) + references = get_allowed_references() + reference_field_names_by_source_class_name = ( + references.get_reference_field_names_by_source_class_name() + ) + for source_collection_name, documents_inserted in docs.items(): + # Check the referential integrity of the replaced or inserted documents. 
+ # + # Note: Much of this code was copy/pasted from refscan, at: + # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 + # + print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") + for document in documents_inserted: + # Get the document's schema class name so that we can interpret its fields accordingly. + source_class_name = derive_schema_class_name_from_document( + schema_view=nmdc_schema_view(), + document=document, + ) + + # Get the names of that class's fields that can contain references. + names_of_reference_fields = ( + reference_field_names_by_source_class_name.get( + source_class_name, [] + ) + ) + + # Check each field that both (a) exists in the document and (b) can contain a reference. + for field_name in names_of_reference_fields: + if field_name in document: + # Determine which collections can contain the referenced document, based upon + # the schema class of which this source document is an instance. + target_collection_names = ( + references.get_target_collection_names( + source_class_name=source_class_name, + source_field_name=field_name, + ) + ) + + # Handle both the multi-value (array) and the single-value (scalar) case, + # normalizing the value or values into a list of values in either case. + if type(document[field_name]) is list: + target_ids = document[field_name] + else: + target_id = document[field_name] + target_ids = [ + target_id + ] # makes a one-item list + + for target_id in target_ids: + name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( + collection_names=target_collection_names, + document_id=target_id, + ) + if ( + name_of_collection_containing_target_document + is None + ): + violation = Violation( + source_collection_name=source_collection_name, + source_field_name=field_name, + source_document_object_id=document.get("_id"), + source_document_id=document.get("id"), + target_id=target_id, + name_of_collection_containing_target=name_of_collection_containing_target_document, + ) + violation_as_str = ( + f"Document '{violation.source_document_id}' " + f"in collection '{violation.source_collection_name}' " + f"has a field '{violation.source_field_name}' that " + f"references a document having id " + f"'{violation.target_id}', but the latter document " + f"does not exist in any of the collections the " + f"NMDC Schema says it can exist in." + ) + validation_errors[source_collection_name].append(violation_as_str) + + # If any collection's error list is not empty, return an error response. 
+ if any(len(v) > 0 for v in validation_errors.values()): + return {"result": "errors", "detail": validation_errors} + return {"result": "All Okay!"} else: return {"result": "errors", "detail": validation_errors} From 3389a4dab1a044e3272cc70f556a5aea93c97ee7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 18 Dec 2024 04:21:15 +0000 Subject: [PATCH 19/70] style: reformat --- nmdc_runtime/util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index d242c31e..81aa8bcd 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -655,7 +655,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr print(f"Inserting documents into the OverlayDB.") for collection_name, documents_to_insert in docs.items(): try: - overlay_db.replace_or_insert_many(collection_name, documents_to_insert) + overlay_db.replace_or_insert_many( + collection_name, documents_to_insert + ) except OverlayDBError as error: validation_errors[collection_name].append(str(error)) @@ -672,7 +674,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr # Note: Much of this code was copy/pasted from refscan, at: # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 # - print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") + print( + f"Checking references emanating from documents inserted into '{source_collection_name}'." + ) for document in documents_inserted: # Get the document's schema class name so that we can interpret its fields accordingly. source_class_name = derive_schema_class_name_from_document( @@ -705,9 +709,7 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr target_ids = document[field_name] else: target_id = document[field_name] - target_ids = [ - target_id - ] # makes a one-item list + target_ids = [target_id] # makes a one-item list for target_id in target_ids: name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( @@ -721,7 +723,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr violation = Violation( source_collection_name=source_collection_name, source_field_name=field_name, - source_document_object_id=document.get("_id"), + source_document_object_id=document.get( + "_id" + ), source_document_id=document.get("id"), target_id=target_id, name_of_collection_containing_target=name_of_collection_containing_target_document, @@ -735,7 +739,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Tr f"does not exist in any of the collections the " f"NMDC Schema says it can exist in." ) - validation_errors[source_collection_name].append(violation_as_str) + validation_errors[ + source_collection_name + ].append(violation_as_str) # If any collection's error list is not empty, return an error response. 
if any(len(v) > 0 for v in validation_errors.values()): From bfd880f96829ea27db19a23e9dcb328d7308aebf Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 17 Dec 2024 20:21:31 -0800 Subject: [PATCH 20/70] Disable referential integrity checking in `validate_json` by default --- nmdc_runtime/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 81aa8bcd..780ec865 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -578,7 +578,7 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = True): +def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = False): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class defined in the NMDC Schema. From e520bd0cf789f357a93701652b0e315bee50a6f2 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 19 Dec 2024 19:20:53 +0100 Subject: [PATCH 21/70] style: format --- tests/test_api/test_endpoints.py | 36 +++++++++++++++++--------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index f68e2330..005d7030 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -492,22 +492,22 @@ def test_find_data_objects_for_study_having_one(api_site_client): ) biosample_id = "nmdc:bsm-00-campione" biosample_dict = { - "id": biosample_id, - "type": "nmdc:Biosample", - "associated_studies": [study_id], - "env_broad_scale": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, - "type": "nmdc:ControlledIdentifiedTermValue" - }, - "env_local_scale": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, - "type": "nmdc:ControlledIdentifiedTermValue" - }, - "env_medium": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, - "type": "nmdc:ControlledIdentifiedTermValue" - } - } + "id": biosample_id, + "type": "nmdc:Biosample", + "associated_studies": [study_id], + "env_broad_scale": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + "env_local_scale": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + "env_medium": { + "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "type": "nmdc:ControlledIdentifiedTermValue", + }, + } mdb.get_collection(name="biosample_set").replace_one( {"id": biosample_id}, biosample_dict, upsert=True ) @@ -551,7 +551,9 @@ def test_find_data_objects_for_study_having_one(api_site_client): # Clean up: Delete the documents we created within this test, from the database. 
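     # (Seeded documents are removed by ID; `alldocs` is a derived cache, so it is simply emptied.)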
mdb.get_collection(name="study_set").delete_one({"id": study_id}) mdb.get_collection(name="biosample_set").delete_one({"id": biosample_id}) - mdb.get_collection(name="data_generation_set").delete_one({"id": mass_spectrometry_id}) + mdb.get_collection(name="data_generation_set").delete_one( + {"id": mass_spectrometry_id} + ) mdb.get_collection(name="data_object_set").delete_one({"id": data_object_id}) mdb.get_collection(name="alldocs").delete_many({}) From 48ec4bf83bb2da0a16c702472e4c97646ce6b9e9 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 19 Dec 2024 19:24:28 +0100 Subject: [PATCH 22/70] feat: clarify docstring; ensure indexes even if no refresh --- tests/test_api/test_endpoints.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index 005d7030..fb8df1d5 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -31,22 +31,21 @@ def ensure_schema_collections_and_alldocs(force_refresh_of_alldocs: bool = False): r""" - This function can be used to ensure things (?) about schema-described collections and the "alldocs" collection. + This function can be used to ensure properties of schema-described collections and the "alldocs" collection. :param bool force_refresh_of_alldocs: Whether you want to force a refresh of the "alldocs" collection, regardless of whether it is empty of not. By default, this function will only refresh the "alldocs" collection if it is empty. """ - - # Return if `alldocs` collection has already been materialized, and caller does not want to force a refresh of it. mdb = get_mongo_db() + ensure_unique_id_indexes(mdb) + # Return if `alldocs` collection has already been materialized, and caller does not want to force a refresh of it. if mdb.alldocs.estimated_document_count() > 0 and not force_refresh_of_alldocs: print( "ensure_schema_collections_and_alldocs: `alldocs` collection already materialized" ) return - ensure_unique_id_indexes(mdb) print("materializing alldocs...") materialize_alldocs( build_op_context( From 59817c14e2e56fcaad48bf0c8cd27124a68b29d2 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 19 Dec 2024 19:24:53 +0100 Subject: [PATCH 23/70] Update tests/test_api/test_endpoints.py Co-authored-by: eecavanna <134325062+eecavanna@users.noreply.github.com> --- tests/test_api/test_endpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index fb8df1d5..d7e62d8e 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -34,7 +34,7 @@ def ensure_schema_collections_and_alldocs(force_refresh_of_alldocs: bool = False This function can be used to ensure properties of schema-described collections and the "alldocs" collection. :param bool force_refresh_of_alldocs: Whether you want to force a refresh of the "alldocs" collection, - regardless of whether it is empty of not. By default, this function + regardless of whether it is empty or not. By default, this function will only refresh the "alldocs" collection if it is empty. 
""" mdb = get_mongo_db() From d7f6a53f6ff65f6b2a70b2781012a8def5e7843e Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 19 Dec 2024 20:07:35 +0100 Subject: [PATCH 24/70] feat: use real example metadata --- Makefile | 2 +- tests/test_api/test_endpoints.py | 116 +++++++++++++++++++++---------- 2 files changed, 79 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index f458db60..3089f211 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,7 @@ mongorestore-nmdc-db: mkdir -p /tmp/remote-mongodump/nmdc # Optionally, manually update MONGO_REMOTE_DUMP_DIR env var: # ```bash - # export MONGO_REMOTE_DUMP_DIR=$(ssh -i ~/.ssh/nersc -q ${NERSC_USERNAME}@dtn01.nersc.gov 'bash -s ' < get_latest_nmdc_prod_dump_dir.sh 2>/dev/null) + # export MONGO_REMOTE_DUMP_DIR=$(ssh -i ~/.ssh/nersc -q ${NERSC_USERNAME}@dtn01.nersc.gov 'bash -s ' < util/get_latest_nmdc_prod_dump_dir.sh 2>/dev/null) # ``` # Rsync the remote dump directory items of interest: rsync -av --exclude='_*' --exclude='fs\.*' \ diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index d7e62d8e..148408a6 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -26,7 +26,7 @@ mongo_resource, RuntimeApiUserClient, ) -from nmdc_runtime.util import REPO_ROOT_DIR, ensure_unique_id_indexes +from nmdc_runtime.util import REPO_ROOT_DIR, ensure_unique_id_indexes, validate_json def ensure_schema_collections_and_alldocs(force_refresh_of_alldocs: bool = False): @@ -459,6 +459,8 @@ def test_find_data_objects_for_study_having_none(api_site_client): "type": "nmdc:Study", "study_category": "research_study", } + assert validate_json({"study_set": [study_dict]}, mdb)["result"] != "errors" + mdb.get_collection(name="study_set").replace_one( {"id": study_id}, study_dict, upsert=True ) @@ -480,60 +482,93 @@ def test_find_data_objects_for_study_having_none(api_site_client): def test_find_data_objects_for_study_having_one(api_site_client): # Seed the test database with a study having one associated data object. 
mdb = get_mongo_db() - study_id = "nmdc:sty-00-studio" + study_id = "nmdc:sty-11-r2h77870" study_dict = { "id": study_id, "type": "nmdc:Study", "study_category": "research_study", } - mdb.get_collection(name="study_set").replace_one( - {"id": study_id}, study_dict, upsert=True - ) - biosample_id = "nmdc:bsm-00-campione" + fakes = set() + assert validate_json({"study_set": [study_dict]}, mdb)["result"] != "errors" + if mdb.get_collection(name="study_set").find_one({"id": study_id}) is None: + mdb.get_collection(name="study_set").insert_one(study_dict) + fakes.add("study") + biosample_id = "nmdc:bsm-11-6zd5nb38" biosample_dict = { "id": biosample_id, - "type": "nmdc:Biosample", - "associated_studies": [study_id], "env_broad_scale": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "has_raw_value": "ENVO_00000446", + "term": { + "id": "ENVO:00000446", + "name": "terrestrial biome", + "type": "nmdc:OntologyClass", + }, "type": "nmdc:ControlledIdentifiedTermValue", }, "env_local_scale": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "has_raw_value": "ENVO_00005801", + "term": { + "id": "ENVO:00005801", + "name": "rhizosphere", + "type": "nmdc:OntologyClass", + }, "type": "nmdc:ControlledIdentifiedTermValue", }, "env_medium": { - "term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, + "has_raw_value": "ENVO_00001998", + "term": { + "id": "ENVO:00001998", + "name": "soil", + "type": "nmdc:OntologyClass", + }, "type": "nmdc:ControlledIdentifiedTermValue", }, + "type": "nmdc:Biosample", + "associated_studies": [study_id], } - mdb.get_collection(name="biosample_set").replace_one( - {"id": biosample_id}, biosample_dict, upsert=True + assert validate_json({"biosample_set": [biosample_dict]}, mdb)["result"] != "errors" + if mdb.get_collection(name="biosample_set").find_one({"id": biosample_id}) is None: + mdb.get_collection(name="biosample_set").insert_one(biosample_dict) + fakes.add("biosample") + + data_generation_id = "nmdc:omprc-11-nmtj1g51" + data_generation_dict = { + "id": data_generation_id, + "has_input": [biosample_id], + "type": "nmdc:NucleotideSequencing", + "analyte_category": "metagenome", + "associated_studies": [study_id], + } + assert ( + validate_json({"data_generation_set": [data_generation_dict]}, mdb)["result"] + != "errors" ) - data_object_id = "nmdc:dobj-00-oggetto" + if ( + mdb.get_collection(name="data_generation_set").find_one( + {"id": data_generation_id} + ) + is None + ): + mdb.get_collection(name="data_generation_set").insert_one(data_generation_dict) + fakes.add("data_generation") + + data_object_id = "nmdc:dobj-11-cpv4y420" data_object_dict = { "id": data_object_id, - "name": "Some name", - "description": "Some description", + "name": "Raw sequencer read data", + "description": "Metagenome Raw Reads for nmdc:omprc-11-nmtj1g51", "type": "nmdc:DataObject", } - mdb.get_collection(name="data_object_set").replace_one( - {"id": data_object_id}, data_object_dict, upsert=True - ) - # Note: The `MassSpectrometry` class inherits from the (abstract) `DataGeneration` class. 
-    # Reference: https://microbiomedata.github.io/nmdc-schema/MassSpectrometry/
-    mass_spectrometry_id = "nmdc:dgms-00-spettro"
-    mass_spectrometry_dict = {
-        "id": mass_spectrometry_id,
-        "type": "nmdc:MassSpectrometry",
-        "analyte_category": "metaproteome",
-        "associated_studies": [study_id],
-        "has_input": [biosample_id],
-        "has_output": [data_object_id],
-    }
-    mdb.get_collection(name="data_generation_set").replace_one(
-        {"id": mass_spectrometry_id}, mass_spectrometry_dict, upsert=True
+    assert (
+        validate_json({"data_object_set": [data_object_dict]}, mdb)["result"]
+        != "errors"
     )
+    if (
+        mdb.get_collection(name="data_object_set").find_one({"id": data_object_id})
+        is None
+    ):
+        mdb.get_collection(name="data_object_set").insert_one(data_object_dict)
+        fakes.add("data_object")
 
     # Update the `alldocs` collection, which is a cache used by the endpoint under test.
     ensure_schema_collections_and_alldocs(force_refresh_of_alldocs=True)
@@ -548,12 +583,17 @@ def test_find_data_objects_for_study_having_one(api_site_client):
     assert len(data_objects_by_biosample[0]["data_objects"]) == 1
     assert data_objects_by_biosample[0]["data_objects"][0]["id"] == data_object_id
 
     # Clean up: Delete the documents we created within this test, from the database.
-    mdb.get_collection(name="study_set").delete_one({"id": study_id})
-    mdb.get_collection(name="biosample_set").delete_one({"id": biosample_id})
-    mdb.get_collection(name="data_generation_set").delete_one(
-        {"id": mass_spectrometry_id}
-    )
-    mdb.get_collection(name="data_object_set").delete_one({"id": data_object_id})
+    if "study" in fakes:
+        mdb.get_collection(name="study_set").delete_one({"id": study_id})
+    if "biosample" in fakes:
+        mdb.get_collection(name="biosample_set").delete_one({"id": biosample_id})
+    if "data_generation" in fakes:
+        mdb.get_collection(name="data_generation_set").delete_one(
+            {"id": data_generation_id}
+        )
+    if "data_object" in fakes:
+        mdb.get_collection(name="data_object_set").delete_one({"id": data_object_id})
+
     mdb.get_collection(name="alldocs").delete_many({})

From 13585782f2996db0ab5d71f77bfdfa927ac58b51 Mon Sep 17 00:00:00 2001
From: Donny Winston
Date: Thu, 19 Dec 2024 20:25:39 +0100
Subject: [PATCH 25/70] fix: more resilient assertion

---
 tests/test_api/test_endpoints.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py
index 148408a6..f525cb45 100644
--- a/tests/test_api/test_endpoints.py
+++ b/tests/test_api/test_endpoints.py
@@ -577,10 +577,13 @@ def test_find_data_objects_for_study_having_one(api_site_client):
     response = api_site_client.request("GET", f"/data_objects/study/{study_id}")
     assert response.status_code == 200
     data_objects_by_biosample = response.json()
-    assert len(data_objects_by_biosample) == 1
-    assert data_objects_by_biosample[0]["biosample_id"] == biosample_id
-    assert len(data_objects_by_biosample[0]["data_objects"]) == 1
-    assert data_objects_by_biosample[0]["data_objects"][0]["id"] == data_object_id
+    assert any(
+        biosample_data_objects["biosample_id"] == biosample_id
+        and any(
+            do["id"] == data_object_id for do in biosample_data_objects["data_objects"]
+        )
+        for biosample_data_objects in data_objects_by_biosample
+    )
 
     # Clean up: Delete the documents we created within this test, from the database.
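     # (Only documents that this test itself inserted, as tracked in `fakes`, are
     # deleted, so records that already existed in the database are left intact.)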
if "study" in fakes: From d98ddfac7cb913cb8d0d50c681bc2d54631a00c2 Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Thu, 19 Dec 2024 21:25:53 +0100 Subject: [PATCH 26/70] fix: add workflow_execution --- tests/test_api/test_endpoints.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py index f525cb45..26e5ce66 100644 --- a/tests/test_api/test_endpoints.py +++ b/tests/test_api/test_endpoints.py @@ -570,6 +570,35 @@ def test_find_data_objects_for_study_having_one(api_site_client): mdb.get_collection(name="data_object_set").insert_one(data_object_dict) fakes.add("data_object") + workflow_execution_id = "nmdc:wfmsa-11-fqq66x60.1" + workflow_execution_dict = { + "id": workflow_execution_id, + "started_at_time": "2023-03-24T02:02:59.479107+00:00", + "ended_at_time": "2023-03-24T02:02:59.479129+00:00", + "was_informed_by": data_generation_id, + "execution_resource": "JGI", + "git_url": "https://github.com/microbiomedata/RawSequencingData", + "has_input": [biosample_id], + "has_output": [data_object_id], + "type": "nmdc:MetagenomeSequencing", + } + assert ( + validate_json({"workflow_execution_set": [workflow_execution_dict]}, mdb)[ + "result" + ] + != "errors" + ) + if ( + mdb.get_collection(name="workflow_execution_set").find_one( + {"id": workflow_execution_id} + ) + is None + ): + mdb.get_collection(name="workflow_execution_set").insert_one( + workflow_execution_dict + ) + fakes.add("workflow_execution") + # Update the `alldocs` collection, which is a cache used by the endpoint under test. ensure_schema_collections_and_alldocs(force_refresh_of_alldocs=True) @@ -596,6 +625,10 @@ def test_find_data_objects_for_study_having_one(api_site_client): ) if "data_object" in fakes: mdb.get_collection(name="data_object_set").delete_one({"id": data_object_id}) + if "workflow_execution" in fakes: + mdb.get_collection(name="workflow_execution_set").delete_one( + {"id": workflow_execution_id} + ) mdb.get_collection(name="alldocs").delete_many({}) From ad079b401097ac86d6f8580fe64447afa5f44d4c Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 20 Dec 2024 10:27:23 -0800 Subject: [PATCH 27/70] filter GOLD projects based on projectStatus --- nmdc_runtime/site/translation/gold_translator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nmdc_runtime/site/translation/gold_translator.py b/nmdc_runtime/site/translation/gold_translator.py index 73572bb9..4a10d76a 100644 --- a/nmdc_runtime/site/translation/gold_translator.py +++ b/nmdc_runtime/site/translation/gold_translator.py @@ -37,6 +37,8 @@ def __init__( for biosample in biosamples if any( project.get("sequencingStrategy") in SEQUENCING_STRATEGIES + and project.get("projectStatus") + in ("Permanent Draft", "Complete and Published") for project in biosample.get("projects", []) ) ] From 3fe47a0d7794e5933015a7e274c2ea3026c746b4 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 26 Dec 2024 12:56:58 -0800 Subject: [PATCH 28/70] dagster harness for missing records updater --- nmdc_runtime/site/graphs.py | 14 ++++++++++++++ nmdc_runtime/site/ops.py | 32 +++++++++++++++++++++++++++++++- nmdc_runtime/site/resources.py | 2 +- nmdc_runtime/site/util.py | 9 +++++++-- nmdc_runtime/site/workspace.yaml | 2 +- 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index fbdd4549..29f3926b 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -57,6 +57,9 @@ 
get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, + get_database_updater_inputs, + nmdc_study_id_filename, + missing_data_generation_repair, ) from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id @@ -467,3 +470,14 @@ def nmdc_study_to_ncbi_submission_export(): all_instruments, ) ncbi_submission_xml_asset(xml_data) + + +@graph +def fill_missing_data_generation_data_object_records(): + study_id = get_database_updater_inputs() + database = missing_data_generation_repair(study_id) + + database_dict = nmdc_schema_object_to_dict(database) + filename = nmdc_study_id_filename(study_id) + outputs = export_json_to_drs(database_dict, filename) + add_output_run_event(outputs) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index a9516c3f..d3283225 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.submission_portal_translator import ( SubmissionPortalTranslator, ) -from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id +from nmdc_runtime.site.repair.database_updater import DatabaseUpdater +from nmdc_runtime.site.util import ( + run_and_log, + schema_collection_has_index_on_id, + nmdc_study_id_to_filename, +) from nmdc_runtime.util import ( drs_object_in_for, get_names_of_classes_in_effective_range_of_slot, @@ -1241,3 +1246,28 @@ def ncbi_submission_xml_from_nmdc_study( all_instruments, ) return ncbi_xml + + +@op +def nmdc_study_id_filename(nmdc_study_id: str) -> str: + filename = nmdc_study_id_to_filename(nmdc_study_id) + return f"missing_database_records_for_{filename}.json" + + +@op( + config_schema={"nmdc_study_id": str}, + out={"nmdc_study_id": Out(str)}, +) +def get_database_updater_inputs(context: OpExecutionContext) -> str: + return context.op_config["nmdc_study_id"] + + +@op(required_resource_keys={"runtime_api_user_client"}) +def missing_data_generation_repair( + context: OpExecutionContext, nmdc_study_id: str +) -> nmdc.Database: + client: RuntimeApiUserClient = context.resources.runtime_api_user_client + database_updater = DatabaseUpdater(client, nmdc_study_id) + database = database_updater.get_database() + + return database diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 7ceb693d..d827a75d 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -134,7 +134,7 @@ def get_biosamples_for_study(self, study_id: str): f"/queries:run", { "find": "biosample_set", - "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}}, + "filter": {"associated_studies": {"$elemMatch": {"$eq": study_id}}}, }, ) response.raise_for_status() diff --git a/nmdc_runtime/site/util.py b/nmdc_runtime/site/util.py index 4280fe65..1f09cb6d 100644 --- a/nmdc_runtime/site/util.py +++ b/nmdc_runtime/site/util.py @@ -1,8 +1,9 @@ import os -from functools import lru_cache -from subprocess import Popen, PIPE, STDOUT, CalledProcessError +from dagster import op +from functools import lru_cache from pymongo.database import Database as MongoDatabase +from subprocess import Popen, PIPE, STDOUT, CalledProcessError from nmdc_runtime.api.db.mongo import get_collection_names_from_schema from nmdc_runtime.site.resources import mongo_resource @@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict: def get_basename(filename: str) -> str: return os.path.basename(filename) + + +def nmdc_study_id_to_filename(nmdc_study_id: str) -> str: + return 
nmdc_study_id.replace(":", "_").replace("-", "_") diff --git a/nmdc_runtime/site/workspace.yaml b/nmdc_runtime/site/workspace.yaml index 5da09ab9..531ad21e 100644 --- a/nmdc_runtime/site/workspace.yaml +++ b/nmdc_runtime/site/workspace.yaml @@ -13,7 +13,7 @@ load_from: attribute: biosample_submission_ingest - python_package: package_name: nmdc_runtime.site.repository - attribute: biosample_export + attribute: database_record_repair # - python_package: # package_name: nmdc_runtime.site.repository # attribute: validation From 8af14933bf6e374e1c9973318c7b8f9df7dabc71 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 26 Dec 2024 13:08:42 -0800 Subject: [PATCH 29/70] stub of DatabaseUpdater class --- nmdc_runtime/site/repair/__init__.py | 0 nmdc_runtime/site/repair/database_updater.py | 16 ++++++++++ nmdc_runtime/site/repository.py | 33 ++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 nmdc_runtime/site/repair/__init__.py create mode 100644 nmdc_runtime/site/repair/database_updater.py diff --git a/nmdc_runtime/site/repair/__init__.py b/nmdc_runtime/site/repair/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py new file mode 100644 index 00000000..28aed0a7 --- /dev/null +++ b/nmdc_runtime/site/repair/database_updater.py @@ -0,0 +1,16 @@ +from nmdc_runtime.site.resources import RuntimeApiUserClient +from nmdc_schema import nmdc + + +class DatabaseUpdater: + def __init__(self, runtime_api_user_client: RuntimeApiUserClient, study_id: str): + self.runtime_api_user_client = runtime_api_user_client + self.study_id = study_id + + def create_missing_dg_records(self): + pass + + def get_database(self) -> nmdc.Database: + database = nmdc.Database() + self.create_missing_dg_records() + return database diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index a1477394..1e4fe10a 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -44,6 +44,7 @@ ingest_neon_surface_water_metadata, ensure_alldocs, nmdc_study_to_ncbi_submission_export, + fill_missing_data_generation_data_object_records, ) from nmdc_runtime.site.resources import ( get_mongo, @@ -922,6 +923,38 @@ def biosample_export(): ] +@repository +def database_record_repair(): + normal_resources = run_config_frozen__normal_env["resources"] + return [ + fill_missing_data_generation_data_object_records.to_job( + resource_defs=resource_defs, + config={ + "resources": merge( + unfreeze(normal_resources), + { + "runtime_api_user_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "username": {"env": "API_ADMIN_USER"}, + "password": {"env": "API_ADMIN_PASS"}, + }, + }, + }, + ), + "ops": { + "get_database_updater_inputs": { + "config": { + "nmdc_study_id": "", + } + }, + "export_json_to_drs": {"config": {"username": ""}}, + }, + }, + ), + ] + + # @repository # def validation(): # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job] From ae220bed9adb589da3b57d0c081ad3ae5f543136 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 27 Dec 2024 13:59:13 -0800 Subject: [PATCH 30/70] harness updates to accommodate DatabaseUpdater --- nmdc_runtime/site/graphs.py | 6 +++-- nmdc_runtime/site/ops.py | 48 ++++++++++++++++++++++++++------- nmdc_runtime/site/repository.py | 16 +++++++++++ nmdc_runtime/site/resources.py | 12 +++++++++ 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/nmdc_runtime/site/graphs.py 
b/nmdc_runtime/site/graphs.py index 29f3926b..f2e844e7 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -474,8 +474,10 @@ def nmdc_study_to_ncbi_submission_export(): @graph def fill_missing_data_generation_data_object_records(): - study_id = get_database_updater_inputs() - database = missing_data_generation_repair(study_id) + (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs() + gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url) + + database = missing_data_generation_repair(study_id, gold_nmdc_instrument_map_df) database_dict = nmdc_schema_object_to_dict(database) filename = nmdc_study_id_filename(study_id) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index d3283225..019a6b91 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1255,19 +1255,49 @@ def nmdc_study_id_filename(nmdc_study_id: str) -> str: @op( - config_schema={"nmdc_study_id": str}, - out={"nmdc_study_id": Out(str)}, + config_schema={ + "nmdc_study_id": str, + "gold_nmdc_instrument_mapping_file_url": str, + }, + out={ + "nmdc_study_id": Out(str), + "gold_nmdc_instrument_mapping_file_url": Out(str), + }, ) -def get_database_updater_inputs(context: OpExecutionContext) -> str: - return context.op_config["nmdc_study_id"] +def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]: + return ( + context.op_config["nmdc_study_id"], + context.op_config["gold_nmdc_instrument_mapping_file_url"], + ) -@op(required_resource_keys={"runtime_api_user_client"}) +@op( + required_resource_keys={ + "runtime_api_user_client", + "runtime_api_site_client", + "gold_api_client", + } +) def missing_data_generation_repair( - context: OpExecutionContext, nmdc_study_id: str + context: OpExecutionContext, + nmdc_study_id: str, + gold_nmdc_instrument_map_df: pd.DataFrame, ) -> nmdc.Database: - client: RuntimeApiUserClient = context.resources.runtime_api_user_client - database_updater = DatabaseUpdater(client, nmdc_study_id) - database = database_updater.get_database() + runtime_api_user_client: RuntimeApiUserClient = ( + context.resources.runtime_api_user_client + ) + runtime_api_site_client: RuntimeApiSiteClient = ( + context.resources.runtime_api_site_client + ) + gold_api_client: GoldApiClient = context.resources.gold_api_client + + database_updater = DatabaseUpdater( + runtime_api_user_client, + runtime_api_site_client, + gold_api_client, + nmdc_study_id, + gold_nmdc_instrument_map_df, + ) + database = database_updater.create_missing_dg_records() return database diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 1e4fe10a..ee1bcdbb 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -940,12 +940,28 @@ def database_record_repair(): "password": {"env": "API_ADMIN_PASS"}, }, }, + "runtime_api_site_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "client_id": {"env": "API_SITE_CLIENT_ID"}, + "client_secret": {"env": "API_SITE_CLIENT_SECRET"}, + "site_id": {"env": "API_SITE_ID"}, + }, + }, + "gold_api_client": { + "config": { + "base_url": {"env": "GOLD_API_BASE_URL"}, + "username": {"env": "GOLD_API_USERNAME"}, + "password": {"env": "GOLD_API_PASSWORD"}, + }, + }, }, ), "ops": { "get_database_updater_inputs": { "config": { "nmdc_study_id": "", + "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv", } }, 
"export_json_to_drs": {"config": {"username": ""}}, diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index d827a75d..e382fe52 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -370,6 +370,18 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]: return None return results[0] + def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]: + id = self._normalize_id(biosample_id) + results = self.request("/projects", params={"biosampleGoldId": id}) + return results + + def fetch_biosample_by_biosample_id( + self, biosample_id: str + ) -> List[Dict[str, Any]]: + id = self._normalize_id(biosample_id) + results = self.request("/biosamples", params={"biosampleGoldId": id}) + return results + @resource( config_schema={ From 47320dbdeac4c839694bcdb6cc163e41b46256aa Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 27 Dec 2024 13:59:44 -0800 Subject: [PATCH 31/70] logic to make DataGeneration records based on GOLD ids --- nmdc_runtime/site/repair/database_updater.py | 102 +++++++++++++++++-- 1 file changed, 96 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 28aed0a7..490240e8 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -1,16 +1,106 @@ -from nmdc_runtime.site.resources import RuntimeApiUserClient +from functools import lru_cache +import pandas as pd +from nmdc_runtime.site.resources import ( + RuntimeApiUserClient, + RuntimeApiSiteClient, + GoldApiClient, +) +from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator from nmdc_schema import nmdc class DatabaseUpdater: - def __init__(self, runtime_api_user_client: RuntimeApiUserClient, study_id: str): + def __init__( + self, + runtime_api_user_client: RuntimeApiUserClient, + runtime_api_site_client: RuntimeApiSiteClient, + gold_api_client: GoldApiClient, + study_id: str, + gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(), + ): self.runtime_api_user_client = runtime_api_user_client + self.runtime_api_site_client = runtime_api_site_client + self.gold_api_client = gold_api_client self.study_id = study_id + self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df + @lru_cache def create_missing_dg_records(self): - pass - - def get_database(self) -> nmdc.Database: database = nmdc.Database() - self.create_missing_dg_records() + + biosample_set = self.runtime_api_user_client.get_biosamples_for_study( + self.study_id + ) + + all_gold_biosamples = [] + all_gold_projects = [] + for biosample in biosample_set: + gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") + if gold_biosample_identifiers: + gold_biosample_id = gold_biosample_identifiers[0] + gold_biosample = self.gold_api_client.fetch_biosample_by_biosample_id( + gold_biosample_id + )[0] + gold_projects = self.gold_api_client.fetch_projects_by_biosample( + gold_biosample_id + ) + gold_biosample["projects"] = gold_projects + all_gold_biosamples.append(gold_biosample) + all_gold_projects.extend(gold_projects) + + gold_study_translator = GoldStudyTranslator( + biosamples=all_gold_biosamples, + projects=all_gold_projects, + gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df, + ) + + filtered_biosamples = gold_study_translator.biosamples + filtered_projects = gold_study_translator.projects + + gold_project_ids = [project["projectGoldId"] for project in filtered_projects] + 
nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id( + "nmdc:NucleotideSequencing", len(gold_project_ids) + ).json() + gold_project_to_nmdc_nucleotide_sequencing_ids = dict( + zip(gold_project_ids, nmdc_nucleotide_sequencing_ids) + ) + + gold_to_nmdc_biosample_ids = { + biosample["gold_biosample_identifiers"][0].replace("gold:", ""): biosample[ + "id" + ] + for biosample in biosample_set + if "gold_biosample_identifiers" in biosample + and biosample["gold_biosample_identifiers"] + } + + database.data_generation_set = [] + for project in filtered_projects: + # Determine biosampleGoldId from filtered_biosamples + biosample_gold_id = next( + ( + biosample["biosampleGoldId"] + for biosample in filtered_biosamples + if any( + p["projectGoldId"] == project["projectGoldId"] + for p in biosample.get("projects", []) + ) + ), + None, + ) + + if biosample_gold_id: + nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id) + if nmdc_biosample_id: + database.data_generation_set.append( + gold_study_translator._translate_nucleotide_sequencing( + project, + nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[ + project["projectGoldId"] + ], + nmdc_biosample_id=nmdc_biosample_id, + nmdc_study_id=self.study_id, + ) + ) + return database From 4434739d6f29c5ef3fcc154e784fd6e8d1875318 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 30 Dec 2024 14:42:39 -0800 Subject: [PATCH 32/70] added tests for DatabaseUpdater --- nmdc_runtime/site/repair/database_updater.py | 1 - tests/test_data/test_database_updater.py | 114 +++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tests/test_data/test_database_updater.py diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 490240e8..e0a5170e 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -76,7 +76,6 @@ def create_missing_dg_records(self): database.data_generation_set = [] for project in filtered_projects: - # Determine biosampleGoldId from filtered_biosamples biosample_gold_id = next( ( biosample["biosampleGoldId"] diff --git a/tests/test_data/test_database_updater.py b/tests/test_data/test_database_updater.py new file mode 100644 index 00000000..a96f3efa --- /dev/null +++ b/tests/test_data/test_database_updater.py @@ -0,0 +1,114 @@ +import pytest + +import pandas as pd + +from unittest.mock import MagicMock, patch + +from nmdc_runtime.site.repair.database_updater import DatabaseUpdater + + +@pytest.fixture +def test_setup(test_minter): + mock_runtime_api_user_client = MagicMock() + mock_runtime_api_site_client = MagicMock() + mock_gold_api_client = MagicMock() + + study_id = "nmdc:sty-11-e4yb9z58" + mock_gold_nmdc_instrument_map_df = pd.DataFrame( + { + "GOLD SeqMethod": [ + "Illumina HiSeq", + "Illumina HiSeq 2500-1TB", + ], + "NMDC instrument_set id": [ + "nmdc:inst-14-79zxap02", + "nmdc:inst-14-nn4b6k72", + ], + } + ) + + mint_id_mock = MagicMock() + mint_id_mock.json.return_value = test_minter("nmdc:NucleotideSequencing", 1) + mock_runtime_api_site_client.mint_id.return_value = mint_id_mock + + database_updater = DatabaseUpdater( + runtime_api_user_client=mock_runtime_api_user_client, + runtime_api_site_client=mock_runtime_api_site_client, + gold_api_client=mock_gold_api_client, + study_id=study_id, + gold_nmdc_instrument_map_df=mock_gold_nmdc_instrument_map_df, + ) + + return { + "runtime_api_user_client": mock_runtime_api_user_client, + 
"runtime_api_site_client": mock_runtime_api_site_client, + "gold_api_client": mock_gold_api_client, + "database_updater": database_updater, + "study_id": study_id, + } + + +@patch("nmdc_runtime.site.repair.database_updater.GoldStudyTranslator") +def test_create_missing_dg_records(MockGoldStudyTranslator, test_setup): + mock_runtime_api_user_client = test_setup["runtime_api_user_client"] + mock_runtime_api_site_client = test_setup["runtime_api_site_client"] + mock_gold_api_client = test_setup["gold_api_client"] + database_updater = test_setup["database_updater"] + + mock_runtime_api_user_client.get_biosamples_for_study.return_value = [ + { + "id": "nmdc:bsm-11-q59jb831", + "gold_biosample_identifiers": ["gold:Gb0150488"], + } + ] + + mock_gold_api_client.fetch_biosample_by_biosample_id.return_value = [ + { + "biosampleGoldId": "Gb0150488", + "biosampleName": "Switchgrass phyllosphere microbial communities", + "projects": [ + { + "projectGoldId": "Gp0208640", + "biosampleGoldId": "Gb0150488", + "sequencingStrategy": "Metagenome", + } + ], + } + ] + + mock_gold_api_client.fetch_projects_by_biosample.return_value = [ + { + "projectGoldId": "Gp0208640", + "biosampleGoldId": "Gb0150488", + "sequencingStrategy": "Metagenome", + } + ] + + MockGoldStudyTranslator.return_value.biosamples = [ + {"biosampleGoldId": "Gb0150488", "projects": [{"projectGoldId": "Gp0208640"}]} + ] + MockGoldStudyTranslator.return_value.projects = [{"projectGoldId": "Gp0208640"}] + + MockGoldStudyTranslator.return_value._translate_nucleotide_sequencing.return_value = MagicMock( + id="nmdc:dgns-00-12345678", + biosample_id="nmdc:bsm-11-q59jb831", + ) + + database = database_updater.create_missing_dg_records() + + assert len(database.data_generation_set) > 0 + assert database.data_generation_set[0].id.startswith("nmdc:dgns-00-") + assert database.data_generation_set[0].biosample_id == "nmdc:bsm-11-q59jb831" + + mock_runtime_api_user_client.get_biosamples_for_study.assert_called_once_with( + test_setup["study_id"] + ) + mock_gold_api_client.fetch_biosample_by_biosample_id.assert_called_once_with( + "gold:Gb0150488" + ) + mock_gold_api_client.fetch_projects_by_biosample.assert_called_once_with( + "gold:Gb0150488" + ) + mock_runtime_api_site_client.mint_id.assert_called_once_with( + "nmdc:NucleotideSequencing", 1 + ) From faf4bf785257a113d7671539782c4f95ce31e010 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 2 Jan 2025 12:30:23 -0800 Subject: [PATCH 33/70] add documentation for logic in the DatabaseUpdater class --- nmdc_runtime/site/repair/database_updater.py | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index e0a5170e..1034aaf9 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -18,6 +18,21 @@ def __init__( study_id: str, gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(), ): + """This class serves as an API for repairing connections in the database by + adding records that are essentially missing "links"/"connections". As we identify + common use cases for adding missing records to the database, we can + add helper methods to this class. + + :param runtime_api_user_client: An object of RuntimeApiUserClient which can be + used to retrieve instance records from the NMDC database. 
+        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
+        used to mint new IDs for the repaired records that need to be added into the NMDC database.
+        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
+        records from GOLD via the GOLD API.
+        :param study_id: NMDC study ID for which the missing records need to be added.
+        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the
+        NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records.
+        """
         self.runtime_api_user_client = runtime_api_user_client
         self.runtime_api_site_client = runtime_api_site_client
         self.gold_api_client = gold_api_client
@@ -26,6 +41,17 @@ def __init__(
 
     @lru_cache
     def create_missing_dg_records(self):
+        """This method creates missing data generation records for a given study in the NMDC database using
+        metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated
+        with the study from the NMDC database. Then, it fetches all the biosample and project data
+        associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id
+        mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class
+        to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based
+        on the number of GOLD sequencing projects, and then reimplement only the part of the logic from that
+        class which is responsible for making data_generation_set records.
+
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
         database = nmdc.Database()
 
         biosample_set = self.runtime_api_user_client.get_biosamples_for_study(

From b028c8f78a4c6ca6f032180ca31a453647f8a807 Mon Sep 17 00:00:00 2001
From: Sujay Patil
Date: Thu, 2 Jan 2025 17:08:50 -0800
Subject: [PATCH 34/70] improve caching in DatabaseUpdater

---
 nmdc_runtime/site/repair/database_updater.py | 27 +++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py
index 1034aaf9..bf213d0d 100644
--- a/nmdc_runtime/site/repair/database_updater.py
+++ b/nmdc_runtime/site/repair/database_updater.py
@@ -1,4 +1,5 @@
 from functools import lru_cache
+from typing import Any, Dict, List
 import pandas as pd
 from nmdc_runtime.site.resources import (
     RuntimeApiUserClient,
@@ -39,6 +40,24 @@ def __init__(
         self.study_id = study_id
         self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
 
+    @lru_cache
+    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
+        """Fetch response from GOLD /biosamples API for a given biosample id.
+ + :param gold_biosample_id: GOLD biosample ID. + :return: Dictionary containing the response from the GOLD /biosamples API. + """ + return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id) + + @lru_cache + def _fetch_gold_projects(self, gold_biosample_id: str): + """Fetch response from GOLD /projects API for a given biosample id. + + :param gold_biosample_id: GOLD biosample ID + :return: Dictionary containing the response from the GOLD /projects API. + """ + return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id) + @lru_cache def create_missing_dg_records(self): """This method creates missing data generation records for a given study in the NMDC database using @@ -64,12 +83,8 @@ class which is responsible for making data_generation_set records. gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") if gold_biosample_identifiers: gold_biosample_id = gold_biosample_identifiers[0] - gold_biosample = self.gold_api_client.fetch_biosample_by_biosample_id( - gold_biosample_id - )[0] - gold_projects = self.gold_api_client.fetch_projects_by_biosample( - gold_biosample_id - ) + gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] + gold_projects = self._fetch_gold_projects(gold_biosample_id) gold_biosample["projects"] = gold_projects all_gold_biosamples.append(gold_biosample) all_gold_projects.extend(gold_projects) From 128906f46f21f1a21eed4841123b078b99c47177 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 7 Jan 2025 12:48:59 -0800 Subject: [PATCH 35/70] modify method that gets biosamples based on study --- nmdc_runtime/site/repair/database_updater.py | 28 ++++++++++---------- nmdc_runtime/site/resources.py | 17 ++++++++---- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index bf213d0d..4edfcd49 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -82,12 +82,13 @@ class which is responsible for making data_generation_set records. for biosample in biosample_set: gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") if gold_biosample_identifiers: - gold_biosample_id = gold_biosample_identifiers[0] - gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] - gold_projects = self._fetch_gold_projects(gold_biosample_id) - gold_biosample["projects"] = gold_projects - all_gold_biosamples.append(gold_biosample) - all_gold_projects.extend(gold_projects) + for gold_biosample_id in gold_biosample_identifiers: + gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] + gold_projects = self._fetch_gold_projects(gold_biosample_id) + gold_biosample["projects"] = gold_projects + + all_gold_biosamples.append(gold_biosample) + all_gold_projects.extend(gold_projects) gold_study_translator = GoldStudyTranslator( biosamples=all_gold_biosamples, @@ -108,14 +109,13 @@ class which is responsible for making data_generation_set records. 
            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
         )
-        gold_to_nmdc_biosample_ids = {
-            biosample["gold_biosample_identifiers"][0].replace("gold:", ""): biosample[
-                "id"
-            ]
-            for biosample in biosample_set
-            if "gold_biosample_identifiers" in biosample
-            and biosample["gold_biosample_identifiers"]
-        }
+        gold_to_nmdc_biosample_ids = {}
+
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                gold_id_stripped = gold_id.replace("gold:", "")
+                gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"]
 
         database.data_generation_set = []
         # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records
diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py
index e382fe52..c00b0900 100644
--- a/nmdc_runtime/site/resources.py
+++ b/nmdc_runtime/site/resources.py
@@ -129,16 +129,23 @@ def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
         return response.json()["cursor"]["firstBatch"]
 
     def get_biosamples_for_study(self, study_id: str):
+        # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
+        # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
+        # but the tradeoff there is that we would need to make multiple requests to step through
+        # each of the pages. By picking a large number for max_page_size, we can get all the results
+        # in a single request.
+        # This method previously used the /queries:run endpoint, but the problem with that was that
+        # it used to truncate the number of results returned to 100.
         response = self.request(
-            "POST",
-            f"/queries:run",
+            "GET",
+            f"/nmdcschema/biosample_set",
             {
-                "find": "biosample_set",
-                "filter": {"associated_studies": {"$elemMatch": {"$eq": study_id}}},
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["resources"]
 
     def get_omics_processing_by_name(self, name: str):
         response = self.request(

From dfbb1feee47ed3fd5955fe8ec30ef8b3f11cdd4a Mon Sep 17 00:00:00 2001
From: Sujay Patil
Date: Thu, 9 Jan 2025 18:04:20 -0800
Subject: [PATCH 36/70] remove cache decorator on create_missing_dg_records()

---
 nmdc_runtime/site/repair/database_updater.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py
index 4edfcd49..27d91ce7 100644
--- a/nmdc_runtime/site/repair/database_updater.py
+++ b/nmdc_runtime/site/repair/database_updater.py
@@ -58,7 +58,6 @@ def _fetch_gold_projects(self, gold_biosample_id: str):
         """
         return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
 
-    @lru_cache
     def create_missing_dg_records(self):
         """This method creates missing data generation records for a given study in the NMDC database using
         metadata from GOLD.
The way the logic works is, it first fetches all the biosamples associated From 497ef60fbdbb2738f118371bbb595e97e9758588 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 15:59:34 -0800 Subject: [PATCH 37/70] Delegate more referential integrity checking steps to `refscan` library --- nmdc_runtime/api/endpoints/metadata.py | 2 +- nmdc_runtime/util.py | 119 +++++++------------- requirements/main.in | 2 +- requirements/main.txt | 146 ++++++++++++++----------- 4 files changed, 125 insertions(+), 144 deletions(-) diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py index 5b30c77f..02221c30 100644 --- a/nmdc_runtime/api/endpoints/metadata.py +++ b/nmdc_runtime/api/endpoints/metadata.py @@ -174,7 +174,7 @@ async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mong """ - return validate_json(docs, mdb) + return validate_json(docs, mdb, check_inter_document_references=True) @router.post("/metadata/json:submit", name="Submit JSON") diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 780ec865..a6c264c7 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -22,13 +22,10 @@ from pydantic import Field, BaseModel from pymongo.database import Database as MongoDatabase from pymongo.errors import OperationFailure -from refscan.lib.helpers import ( - derive_schema_class_name_from_document, - identify_references, -) +from refscan.lib.helpers import identify_references from refscan.lib.Finder import Finder from refscan.lib.ReferenceList import ReferenceList -from refscan.lib.Violation import Violation +from refscan.scanner import scan_outgoing_references from toolz import merge, unique from nmdc_runtime.api.core.util import sha256hash_from_file @@ -333,6 +330,14 @@ def nmdc_database_collection_instance_class_names(): @lru_cache def nmdc_database_collection_names(): + r""" + TODO: Document this function. + + TODO: Assuming this function was designed to return a list of names of all Database slots that represents database + collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py` + instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema + maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`). + """ names = [] view = nmdc_schema_view() all_classes = set(view.all_classes()) @@ -578,10 +583,10 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = False): +def validate_json(in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class - defined in the NMDC Schema. + defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis. Example dictionary: { @@ -595,12 +600,13 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Fa ] } - :param dict in_docs: The dictionary you want to validate - :param MongoDatabase mdb: A reference to a MongoDB database - :param bool check_references: Whether you want this function to check whether every document that is referenced - by any of the documents passed in would, indeed, exist in the database, if the - documents passed in were to be inserted into the database. In other words, set this - to `True` if you want this function to perform referential integrity checks. 
+ :param in_docs: The dictionary you want to validate + :param mdb: A reference to a MongoDB database + :param check_inter_document_references: Whether you want this function to check whether every document that + is referenced by any of the documents passed in would, indeed, exist + in the database, if the documents passed in were to be inserted into + the database. In other words, set this to `True` if you want this + function to perform referential integrity checks. """ validator = Draft7Validator(get_nmdc_jsonschema_dict()) docs = deepcopy(in_docs) @@ -642,14 +648,14 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Fa return {"result": "errors", "detail": str(e)} # Third pass (if enabled): Check inter-document references. - if check_references is True: + if check_inter_document_references is True: # Insert all documents specified for all collections specified, into the OverlayDB. # # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, # instead, validate it after processing _each_ collection, we would get a false positive if a document # inserted into an earlier-processed collection happened to reference a document slated for insertion # into a later-processed collection. By waiting until all documents in all collections specified have - # been inserted, we avoid that scenario. + # been inserted, we avoid that situation. # with OverlayDB(mdb) as overlay_db: print(f"Inserting documents into the OverlayDB.") @@ -674,74 +680,29 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_references: bool = Fa # Note: Much of this code was copy/pasted from refscan, at: # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 # - print( - f"Checking references emanating from documents inserted into '{source_collection_name}'." - ) + print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") for document in documents_inserted: - # Get the document's schema class name so that we can interpret its fields accordingly. - source_class_name = derive_schema_class_name_from_document( - schema_view=nmdc_schema_view(), + violations = scan_outgoing_references( document=document, + schema_view=nmdc_schema_view(), + reference_field_names_by_source_class_name=reference_field_names_by_source_class_name, + references=references, + finder=finder, + collection_names=nmdc_database_collection_names(), + source_collection_name=source_collection_name, + user_wants_to_locate_misplaced_documents=False, ) - - # Get the names of that class's fields that can contain references. - names_of_reference_fields = ( - reference_field_names_by_source_class_name.get( - source_class_name, [] + for violation in violations: + violation_as_str = ( + f"Document '{violation.source_document_id}' " + f"in collection '{violation.source_collection_name}' " + f"has a field '{violation.source_field_name}' that " + f"references a document having id " + f"'{violation.target_id}', but the latter document " + f"does not exist in any of the collections the " + f"NMDC Schema says it can exist in." ) - ) - - # Check each field that both (a) exists in the document and (b) can contain a reference. - for field_name in names_of_reference_fields: - if field_name in document: - # Determine which collections can contain the referenced document, based upon - # the schema class of which this source document is an instance. 
- target_collection_names = ( - references.get_target_collection_names( - source_class_name=source_class_name, - source_field_name=field_name, - ) - ) - - # Handle both the multi-value (array) and the single-value (scalar) case, - # normalizing the value or values into a list of values in either case. - if type(document[field_name]) is list: - target_ids = document[field_name] - else: - target_id = document[field_name] - target_ids = [target_id] # makes a one-item list - - for target_id in target_ids: - name_of_collection_containing_target_document = finder.check_whether_document_having_id_exists_among_collections( - collection_names=target_collection_names, - document_id=target_id, - ) - if ( - name_of_collection_containing_target_document - is None - ): - violation = Violation( - source_collection_name=source_collection_name, - source_field_name=field_name, - source_document_object_id=document.get( - "_id" - ), - source_document_id=document.get("id"), - target_id=target_id, - name_of_collection_containing_target=name_of_collection_containing_target_document, - ) - violation_as_str = ( - f"Document '{violation.source_document_id}' " - f"in collection '{violation.source_collection_name}' " - f"has a field '{violation.source_field_name}' that " - f"references a document having id " - f"'{violation.target_id}', but the latter document " - f"does not exist in any of the collections the " - f"NMDC Schema says it can exist in." - ) - validation_errors[ - source_collection_name - ].append(violation_as_str) + validation_errors[source_collection_name].append(violation_as_str) # If any collection's error list is not empty, return an error response. if any(len(v) > 0 for v in validation_errors.values()): diff --git a/requirements/main.in b/requirements/main.in index 9d0e1481..35078e3f 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -39,7 +39,7 @@ python-multipart>=0.0.18 pyyaml # Note: We use `refscan` to get information about inter-document references from the schema and database. 
# Reference: https://pypi.org/project/refscan/ -refscan==0.1.22 +refscan==0.2.0 requests semver setuptools-scm diff --git a/requirements/main.txt b/requirements/main.txt index 5f683fa5..5e9e9b83 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/main.txt --strip-extras requirements/main.in @@ -14,13 +14,15 @@ antlr4-python3-runtime==4.9.3 # linkml # pyjsg # pyshexc -anyio==4.7.0 +anyio==4.8.0 # via # gql # httpx # jupyter-server # starlette # watchfiles +appnope==0.1.4 + # via ipykernel argon2-cffi==23.1.0 # via jupyter-server argon2-cffi-bindings==21.2.0 @@ -31,7 +33,7 @@ asttokens==3.0.0 # via stack-data async-lru==2.0.4 # via jupyterlab -attrs==24.2.0 +attrs==24.3.0 # via # cattrs # jsonschema @@ -47,7 +49,7 @@ base32-lib==1.0.2 # via -r requirements/main.in bcrypt==4.2.1 # via passlib -beanie==1.27.0 +beanie==1.29.0 # via -r requirements/main.in beautifulsoup4==4.12.3 # via @@ -56,15 +58,15 @@ beautifulsoup4==4.12.3 # nbconvert bleach==6.2.0 # via nbconvert -boto3==1.35.76 +boto3==1.35.98 # via -r requirements/main.in -botocore==1.35.76 +botocore==1.35.98 # via # boto3 # s3transfer cattrs==24.1.2 # via requests-cache -certifi==2024.8.30 +certifi==2024.12.14 # via # httpcore # httpx @@ -79,9 +81,9 @@ chardet==5.2.0 # via # pyshex # pyshexc -charset-normalizer==3.4.0 +charset-normalizer==3.4.1 # via requests -click==8.1.7 +click==8.1.8 # via # -r requirements/main.in # beanie @@ -102,39 +104,39 @@ comm==0.2.2 # via # ipykernel # ipywidgets -croniter==3.0.4 +croniter==5.0.1 # via dagster cryptography==44.0.0 # via python-jose -curies==0.9.0 +curies==0.10.2 # via # linkml-runtime # prefixmaps -dagit==1.9.3 +dagit==1.9.8 # via -r requirements/main.in -dagster==1.9.3 +dagster==1.9.8 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.9.3 +dagster-graphql==1.9.8 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.9.3 +dagster-pipes==1.9.8 # via dagster -dagster-postgres==0.25.3 +dagster-postgres==0.25.8 # via -r requirements/main.in -dagster-webserver==1.9.3 +dagster-webserver==1.9.8 # via dagit -debugpy==1.8.9 +debugpy==1.8.11 # via ipykernel decorator==5.1.1 # via ipython defusedxml==0.7.1 # via nbconvert -dependency-injector==4.43.0 +dependency-injector==4.45.0 # via -r requirements/main.in deprecated==1.2.15 # via linkml-runtime @@ -148,12 +150,14 @@ dotted-dict==1.1.3 # via -r requirements/main.in ecdsa==0.19.0 # via python-jose -editorconfig==0.12.4 +editorconfig==0.17.0 # via jsbeautifier email-validator==2.2.0 # via pydantic et-xmlfile==2.0.0 # via openpyxl +eval-type-backport==0.2.2 + # via curies exceptiongroup==1.2.2 # via # anyio @@ -176,7 +180,7 @@ fqdn==1.5.1 # via jsonschema frozendict==2.4.6 # via -r requirements/main.in -fsspec==2024.10.0 +fsspec==2024.12.0 # via universal-pathlib ghp-import==2.1.0 # via mkdocs @@ -195,13 +199,11 @@ graphql-relay==3.2.0 # via graphene graphviz==0.20.3 # via linkml -greenlet==3.1.1 - # via sqlalchemy -grpcio==1.68.1 +grpcio==1.69.0 # via # dagster # grpcio-health-checking -grpcio-health-checking==1.62.3 +grpcio-health-checking==1.69.0 # via dagster h11==0.14.0 # via @@ -216,7 +218,7 @@ httpcore==1.0.7 # via httpx httptools==0.6.4 # via uvicorn -httpx==0.28.0 +httpx==0.28.1 # via jupyterlab humanfriendly==10.0 # via coloredlogs @@ 
-228,6 +230,16 @@ idna==3.10 # jsonschema # requests # yarl +importlib-metadata==8.5.0 + # via + # jupyter-client + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # markdown + # mkdocs + # mkdocs-get-deps + # nbconvert iniconfig==2.0.0 # via pytest ipykernel==6.29.5 @@ -236,7 +248,7 @@ ipykernel==6.29.5 # jupyter-console # jupyterlab # mkdocs-jupyter -ipython==8.30.0 +ipython==8.18.1 # via # ipykernel # ipywidgets @@ -251,7 +263,7 @@ isoduration==20.11.0 # via jsonschema jedi==0.19.2 # via ipython -jinja2==3.1.4 +jinja2==3.1.5 # via # dagster # jupyter-server @@ -320,11 +332,11 @@ jupyter-core==5.7.2 # nbclient # nbconvert # nbformat -jupyter-events==0.10.0 +jupyter-events==0.11.0 # via jupyter-server jupyter-lsp==2.2.5 # via jupyterlab -jupyter-server==2.14.2 +jupyter-server==2.15.0 # via # jupyter-lsp # jupyterlab @@ -333,7 +345,7 @@ jupyter-server==2.14.2 # notebook-shim jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.3.2 +jupyterlab==4.3.4 # via # -r requirements/main.in # jupyter @@ -346,11 +358,11 @@ jupyterlab-server==2.27.3 # notebook jupyterlab-widgets==3.0.13 # via ipywidgets -jupytext==1.16.4 +jupytext==1.16.6 # via mkdocs-jupyter lazy-model==0.2.0 # via beanie -linkml==1.8.5 +linkml==1.8.6 # via # -r requirements/main.in # nmdc-schema @@ -365,7 +377,7 @@ linkml-runtime==1.8.3 # refscan lxml==5.3.0 # via -r requirements/main.in -mako==1.3.7 +mako==1.3.8 # via alembic markdown==3.7 # via @@ -395,7 +407,7 @@ mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps -mistune==3.0.2 +mistune==3.1.0 # via nbconvert mkdocs==1.6.1 # via @@ -408,7 +420,7 @@ mkdocs-get-deps==0.2.0 # via mkdocs mkdocs-jupyter==0.25.1 # via -r requirements/main.in -mkdocs-material==9.5.47 +mkdocs-material==9.5.49 # via # -r requirements/main.in # mkdocs-jupyter @@ -427,9 +439,9 @@ motor==3.6.0 # beanie multidict==6.1.0 # via yarl -nbclient==0.10.1 +nbclient==0.10.2 # via nbconvert -nbconvert==7.16.4 +nbconvert==7.16.5 # via # jupyter # jupyter-server @@ -444,13 +456,13 @@ nest-asyncio==1.6.0 # via ipykernel nmdc-schema==11.2.1 # via -r requirements/main.in -notebook==7.3.1 +notebook==7.3.2 # via jupyter notebook-shim==0.2.4 # via # jupyterlab # notebook -numpy==2.1.3 +numpy==2.0.2 # via pandas openpyxl==3.1.5 # via @@ -511,11 +523,11 @@ prompt-toolkit==3.0.48 # jupyter-console propcache==0.2.1 # via yarl -protobuf==4.25.5 +protobuf==5.29.3 # via # dagster # grpcio-health-checking -psutil==6.1.0 +psutil==6.1.1 # via ipykernel psycopg2-binary==2.9.10 # via dagster-postgres @@ -531,7 +543,7 @@ pyasn1==0.6.1 # rsa pycparser==2.22 # via cffi -pydantic==2.9.2 +pydantic==2.10.5 # via # -r requirements/main.in # beanie @@ -541,9 +553,9 @@ pydantic==2.9.2 # lazy-model # linkml # linkml-runtime -pydantic-core==2.23.4 +pydantic-core==2.27.2 # via pydantic -pygments==2.18.0 +pygments==2.19.1 # via # ipython # jupyter-console @@ -556,7 +568,7 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.12 +pymdown-extensions==10.14 # via # mkdocs-material # mkdocs-mermaid2-plugin @@ -566,7 +578,7 @@ pymongo==4.9.2 # motor # nmdc-schema # refscan -pyparsing==3.2.0 +pyparsing==3.2.1 # via rdflib pyshex==0.8.1 # via linkml @@ -595,9 +607,9 @@ python-dotenv==1.0.1 # uvicorn python-jose==3.3.0 # via -r requirements/main.in -python-json-logger==2.0.7 +python-json-logger==3.2.1 # via jupyter-events -python-multipart==0.0.19 +python-multipart==0.0.20 # via -r requirements/main.in pytrie==0.4.0 # via curies @@ -631,7 +643,7 @@ pyzmq==26.2.0 # jupyter-client # jupyter-console # jupyter-server 
-rdflib==7.1.1 +rdflib==7.1.2 # via # cfgraph # linkml @@ -652,7 +664,7 @@ referencing==0.35.1 # jsonschema # jsonschema-specifications # jupyter-events -refscan==0.1.22 +refscan==0.2.0 # via -r requirements/main.in regex==2024.11.6 # via mkdocs-material @@ -696,7 +708,7 @@ rpds-py==0.22.3 # referencing rsa==4.9 # via python-jose -ruamel-yaml==0.18.6 +ruamel-yaml==0.18.10 # via # linkml-dataops # nmdc-schema @@ -716,10 +728,9 @@ shexjsg==0.8.2 # via # pyshex # pyshexc -six==1.16.0 +six==1.17.0 # via # base32-lib - # dependency-injector # ecdsa # jsbeautifier # python-dateutil @@ -737,7 +748,7 @@ sparqlwrapper==2.0.0 # via # pyshex # sparqlslurper -sqlalchemy==2.0.36 +sqlalchemy==2.0.37 # via # alembic # dagster @@ -760,7 +771,7 @@ terminado==0.18.1 # jupyter-server # jupyter-server-terminals tinycss2==1.4.0 - # via nbconvert + # via bleach toml==0.10.2 # via beanie tomli==2.2.1 @@ -802,7 +813,7 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -typer==0.12.5 +typer==0.15.1 # via refscan types-python-dateutil==2.9.0.20241206 # via arrow @@ -813,32 +824,39 @@ typing-extensions==4.12.2 # async-lru # beanie # cattrs + # curies # dagster # fastapi # graphene + # graphql-core # ipython + # linkml + # mistune # multidict # pydantic # pydantic-core + # python-json-logger # rich + # setuptools-scm # sqlalchemy + # starlette # typer # uvicorn tzdata==2024.2 # via pandas -universal-pathlib==0.2.5 +universal-pathlib==0.2.6 # via dagster uri-template==1.3.0 # via jsonschema url-normalize==1.4.3 # via requests-cache -urllib3==2.2.3 +urllib3==1.26.20 # via # botocore # pyshex # requests # requests-cache -uvicorn==0.32.1 +uvicorn==0.34.0 # via # -r requirements/main.in # dagster-webserver @@ -849,7 +867,7 @@ watchdog==5.0.3 # dagster # linkml # mkdocs -watchfiles==1.0.0 +watchfiles==1.0.4 # via uvicorn wcwidth==0.2.13 # via prompt-toolkit @@ -865,7 +883,7 @@ websockets==14.1 # via uvicorn widgetsnbextension==4.0.13 # via ipywidgets -wrapt==1.17.0 +wrapt==1.17.1 # via deprecated xlrd==2.0.1 # via -r requirements/main.in @@ -873,9 +891,11 @@ xlsxwriter==3.2.0 # via -r requirements/main.in yarl==1.18.3 # via gql +zipp==3.21.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -setuptools==75.6.0 +setuptools==75.8.0 # via # dagster # jupyterlab From 2f149c8f6c0c8507e3ac5e4e0730d6ddadaff295 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 16:04:56 -0800 Subject: [PATCH 38/70] Explicitly state that we are ignoring a function's return value --- nmdc_runtime/api/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/api/main.py b/nmdc_runtime/api/main.py index 6498d4cb..c4fc152a 100644 --- a/nmdc_runtime/api/main.py +++ b/nmdc_runtime/api/main.py @@ -371,9 +371,10 @@ async def lifespan(app: FastAPI): ensure_initial_resources_on_boot() ensure_attribute_indexes() ensure_default_api_perms() - _ = ( - get_allowed_references() - ) # note: future invocations will benefit from the function's memoized-ness + + # Invoke a function—thereby priming its memoization cache—in order to speed up all future invocations. 
+ get_allowed_references() # we ignore the return value here + yield From 9086a86899a30eadce97bb2fe44b62a1651806d4 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 00:23:33 +0000 Subject: [PATCH 39/70] style: reformat --- nmdc_runtime/util.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index a6c264c7..218c265d 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -583,7 +583,9 @@ def merge_find(self, coll_name, find_spec: dict): yield doc -def validate_json(in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False): +def validate_json( + in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False +): r""" Checks whether the specified dictionary represents a valid instance of the `Database` class defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis. @@ -680,7 +682,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_inter_document_refere # Note: Much of this code was copy/pasted from refscan, at: # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 # - print(f"Checking references emanating from documents inserted into '{source_collection_name}'.") + print( + f"Checking references emanating from documents inserted into '{source_collection_name}'." + ) for document in documents_inserted: violations = scan_outgoing_references( document=document, @@ -702,7 +706,9 @@ def validate_json(in_docs: dict, mdb: MongoDatabase, check_inter_document_refere f"does not exist in any of the collections the " f"NMDC Schema says it can exist in." ) - validation_errors[source_collection_name].append(violation_as_str) + validation_errors[source_collection_name].append( + violation_as_str + ) # If any collection's error list is not empty, return an error response. if any(len(v) > 0 for v in validation_errors.values()): From a4001ee5d19796e6ee56a0df8541955e7265691e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 17:43:59 -0800 Subject: [PATCH 40/70] Avoid invoking `insert_many` when there are no documents to insert --- nmdc_runtime/util.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 16b276c9..009c57aa 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -706,12 +706,13 @@ def validate_json( with OverlayDB(mdb) as overlay_db: print(f"Inserting documents into the OverlayDB.") for collection_name, documents_to_insert in docs.items(): - try: - overlay_db.replace_or_insert_many( - collection_name, documents_to_insert - ) - except OverlayDBError as error: - validation_errors[collection_name].append(str(error)) + if len(documents_to_insert) > 0: + try: + overlay_db.replace_or_insert_many( + collection_name, documents_to_insert + ) + except OverlayDBError as error: + validation_errors[collection_name].append(str(error)) # Now that the OverlayDB contains all the specified documents, we will check whether # every document referenced by any of the inserted documents exists. 
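
pymongo refuses to run `insert_many` with an empty document list, which is why the guard above
checks `len(documents_to_insert) > 0` before inserting. A minimal sketch of that pattern,
illustrative only and not part of the patch series (it assumes a MongoDB server on localhost
and made-up collection contents):

    from pymongo import MongoClient

    mdb = MongoClient()["scratch_db"]  # assumes a MongoDB server on localhost

    docs_by_collection = {
        "study_set": [],  # empty list: must be skipped, since pymongo rejects insert_many([])
        "biosample_set": [{"id": "nmdc:bsm-00-000001"}],
    }
    for collection_name, documents in docs_by_collection.items():
        if len(documents) > 0:  # the same guard the patch adds
            mdb[collection_name].insert_many(documents)

Similarly, the startup priming of `get_allowed_references()` earlier in this series relies on
`functools.cache`, which memoizes a function by its arguments: the first call computes and
stores the result, and later calls with the same arguments return the stored value. A
self-contained sketch:

    from functools import cache

    @cache
    def load_expensive_resource() -> dict:
        print("computing...")  # printed only on the first call
        return {"ready": True}

    load_expensive_resource()  # primes the cache; the return value can be ignored
    load_expensive_resource()  # served from the cache, with no recomputation
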
From 62fb1a46baf49bfcb9b6529ebf3fe785f74204a7 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 17:45:59 -0800 Subject: [PATCH 41/70] Update automated tests to exercise referential integrity checking --- tests/test_the_util/test_the_util.py | 108 ++++++++++++++++++++------- 1 file changed, 82 insertions(+), 26 deletions(-) diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index ba09d2dd..d03b9642 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -2,42 +2,47 @@ from nmdc_runtime.util import validate_json # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo, -# by issuing the following command from the root directory of the repository: +# by issuing the following command from the root directory of the repository within the `fastapi` container: # ``` # $ pytest tests/test_the_util/test_the_util.py # ``` +# Define a reusable dictionary that matches the value the `validate_json` function +# returns when it considers the input to be valid. +ok_result = {"result": "All Okay!"} + def test_validate_json(): # Get a reference to the MongoDB database, since the `validate_json` function requires # it to be passed in as a parameter. mdb = get_mongo_db() - # Define a reusable dictionary that matches the value the `validate_json` function - # returns when it considers the input to be valid. - ok_result = {"result": "All Okay!"} - # Test: An empty outer dictionary is valid. database_dict = {} - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result # Test: An empty collection is valid. database_dict = {"study_set": []} - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result # Test: The function reports an error for a schema-defiant collection name. database_dict = {"OTHER_set": []} result = validate_json(in_docs=database_dict, mdb=mdb) assert result["result"] == "errors" - assert "OTHER_set" in result["detail"] + assert len(result["detail"]["OTHER_set"]) == 1 + # + # Invoke the function-under-test again, but with referential integrity checking enabled. + # + result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + assert result["result"] == "errors" assert len(result["detail"]["OTHER_set"]) == 1 # Test: Two empty collections is valid. database_dict = {"biosample_set": [], "study_set": []} - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result # Test: A schema-compliant document is valid. database_dict = { @@ -49,41 +54,66 @@ def test_validate_json(): } ] } - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result - # Test: Multiple schema-compliant documents are valid. + # Test: A schema-defiant document is invalid. 
database_dict = { "study_set": [ { - "id": "nmdc:sty-00-000001", + "id": "nmdc:OTHER-00-000001", "type": "nmdc:Study", "study_category": "research_study", }, + ] + } + result = validate_json(in_docs=database_dict, mdb=mdb) + assert result["result"] == "errors" + assert len(result["detail"]["study_set"]) == 1 + # + # Invoke the function-under-test again, but with referential integrity checking enabled. + # + result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + assert result["result"] == "errors" + assert len(result["detail"]["study_set"]) == 1 + + # Test: An otherwise schema-compliant document that references a non-existent document is valid when referential + # integrity checking is disabled, and is invalid when referential integrity checking is enabled. + database_dict = { + "study_set": [ { - "id": "nmdc:sty-00-000002", + "id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study", - }, + "part_of": ["nmdc:sty-00-999999"], # identifies a non-existent study + } ] } - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + # + # Invoke the function-under-test again, but with referential integrity checking enabled. + # + result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + assert len(result["detail"]["study_set"]) == 1 + assert "nmdc:sty-00-000001" in result["detail"]["study_set"][0] - # Test: The function reports an error for the schema-defiant document. + # Test: Multiple schema-compliant documents are valid. database_dict = { "study_set": [ { - "id": "nmdc:OTHER-00-000001", + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + }, + { + "id": "nmdc:sty-00-000002", "type": "nmdc:Study", "study_category": "research_study", }, ] } - result = validate_json(in_docs=database_dict, mdb=mdb) - assert result["result"] == "errors" - assert "study_set" in result["detail"] - assert len(result["detail"]["study_set"]) == 1 + assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result # Test: The function reports an error for each schema-defiant document. database_dict = { @@ -104,3 +134,29 @@ def test_validate_json(): assert result["result"] == "errors" assert "study_set" in result["detail"] assert len(result["detail"]["study_set"]) == 2 + # + # Invoke the function-under-test again, but with referential integrity checking enabled. + # + result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + assert result["result"] == "errors" + assert "study_set" in result["detail"] + assert len(result["detail"]["study_set"]) == 2 + + # Test: A single request can add a document that references another document added via the same request. The + # referential integrity checker performs its check on the _final_ result of all requested operations across + # all collections. 
+ database_dict = { + "biosample_set": [ + { + "id": "nmdc:bsm-00-000001", + "type": "nmdc:Biosample", + "associated_studies": ["nmdc:sty-00-000001"], + "env_broad_scale": {"term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, "type": "nmdc:ControlledIdentifiedTermValue"}, "env_local_scale": {"term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, "type": "nmdc:ControlledIdentifiedTermValue"}, "env_medium": {"term": {"type": "nmdc:OntologyClass", "id": "ENVO:000000"}, "type": "nmdc:ControlledIdentifiedTermValue"} + } + ], + "study_set": [ + {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"}, + {"id": "nmdc:sty-00-000002", "type": "nmdc:Study", "study_category": "research_study", "part_of": ["nmdc:sty-00-000001"]} + ] + } + assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result From 776bc3479f58b621591e33615ff9812bca230c7c Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 18:52:18 -0800 Subject: [PATCH 42/70] Handle collection named `@type` whose documents are strings, not dicts I don't know the use case here. It is a use case that the preexisting validation stages allowed for. I do not see any documentation about it. --- nmdc_runtime/util.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 009c57aa..89478cf2 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -661,6 +661,8 @@ def validate_json( known_coll_names = set(nmdc_database_collection_names()) for coll_name, coll_docs in docs.items(): if coll_name not in known_coll_names: + # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name). + # See: https://github.com/microbiomedata/nmdc-runtime/discussions/858 if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"): continue else: @@ -695,6 +697,10 @@ def validate_json( # Third pass (if enabled): Check inter-document references. if check_inter_document_references is True: + def is_dict(v) -> bool: + r"""Helper function that determines whether the specified value is a dictionary.""" + return isinstance(v, dict) + # Insert all documents specified for all collections specified, into the OverlayDB. # # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, @@ -705,7 +711,20 @@ def validate_json( # with OverlayDB(mdb) as overlay_db: print(f"Inserting documents into the OverlayDB.") - for collection_name, documents_to_insert in docs.items(): + for collection_name, raw_documents_to_insert in docs.items(): + # Filter out documents that are strings instead of dictionaries. + # + # Note: This is to work around the fact that the previous validation stages allow for the + # request payload to specify a collection named "@type" whose value is a string, as + # opposed to a dictionary. I don't know why they allow that. I posed the question in this + # GitHub Discussion: https://github.com/microbiomedata/nmdc-runtime/discussions/858 + # For now, I am filtering out documents that are not dictionaries, and logging a message. + # + documents_to_insert = list(filter(is_dict, raw_documents_to_insert)) + if len(raw_documents_to_insert) - len(documents_to_insert) > 0: + print(f"Filtered out documents that were not dictionaries.") + + # If any documents survived that filtering stage, insert them. 
if len(documents_to_insert) > 0: try: overlay_db.replace_or_insert_many( @@ -721,7 +740,15 @@ def validate_json( reference_field_names_by_source_class_name = ( references.get_reference_field_names_by_source_class_name() ) - for source_collection_name, documents_inserted in docs.items(): + for source_collection_name, raw_documents_inserted in docs.items(): + # Filter out documents that are strings instead of dictionaries. + # + # Note: Again, this is to work around the fact that the previous validation stages allow for the + # request payload to specify a collection named "@type" whose value is a string, as + # opposed to a dictionary. + # + documents_inserted = list(filter(is_dict, raw_documents_inserted)) + # Check the referential integrity of the replaced or inserted documents. # # Note: Much of this code was copy/pasted from refscan, at: From 99826e757669d0c753a707e34ae51b324c4befd1 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 02:52:51 +0000 Subject: [PATCH 43/70] style: reformat --- nmdc_runtime/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 89478cf2..b02a532f 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -697,6 +697,7 @@ def validate_json( # Third pass (if enabled): Check inter-document references. if check_inter_document_references is True: + def is_dict(v) -> bool: r"""Helper function that determines whether the specified value is a dictionary.""" return isinstance(v, dict) From dca5b82665dd32bf90c1ff782d8553e91e72efc3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 18:59:09 -0800 Subject: [PATCH 44/70] Clarify comments --- nmdc_runtime/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index b02a532f..9c42c0e8 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -716,8 +716,8 @@ def is_dict(v) -> bool: # Filter out documents that are strings instead of dictionaries. # # Note: This is to work around the fact that the previous validation stages allow for the - # request payload to specify a collection named "@type" whose value is a string, as - # opposed to a dictionary. I don't know why they allow that. I posed the question in this + # request payload to specify a collection named "@type" whose value is a string, as opposed + # to a list of dictionaries. I don't know why they allow that. I posed the question in this # GitHub Discussion: https://github.com/microbiomedata/nmdc-runtime/discussions/858 # For now, I am filtering out documents that are not dictionaries, and logging a message. # @@ -745,8 +745,8 @@ def is_dict(v) -> bool: # Filter out documents that are strings instead of dictionaries. # # Note: Again, this is to work around the fact that the previous validation stages allow for the - # request payload to specify a collection named "@type" whose value is a string, as - # opposed to a dictionary. + # request payload to specify a collection named "@type" whose value is a string, as opposed + # to a list of dictionaries. 
# documents_inserted = list(filter(is_dict, raw_documents_inserted)) From 7eba7619e4f869517be972b53633298ab99eb93b Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 19:12:11 -0800 Subject: [PATCH 45/70] Distinguish list from non-list values (to support `@type` collection) --- nmdc_runtime/util.py | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 9c42c0e8..55dfbb44 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -697,11 +697,6 @@ def validate_json( # Third pass (if enabled): Check inter-document references. if check_inter_document_references is True: - - def is_dict(v) -> bool: - r"""Helper function that determines whether the specified value is a dictionary.""" - return isinstance(v, dict) - # Insert all documents specified for all collections specified, into the OverlayDB. # # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, @@ -712,21 +707,20 @@ def is_dict(v) -> bool: # with OverlayDB(mdb) as overlay_db: print(f"Inserting documents into the OverlayDB.") - for collection_name, raw_documents_to_insert in docs.items(): - # Filter out documents that are strings instead of dictionaries. + for collection_name, documents_to_insert in docs.items(): + # Insert the documents into the OverlayDB. # - # Note: This is to work around the fact that the previous validation stages allow for the - # request payload to specify a collection named "@type" whose value is a string, as opposed - # to a list of dictionaries. I don't know why they allow that. I posed the question in this - # GitHub Discussion: https://github.com/microbiomedata/nmdc-runtime/discussions/858 - # For now, I am filtering out documents that are not dictionaries, and logging a message. + # Note: The `isinstance(..., list)` check is here to work around the fact that the previous + # validation stages allow for the request payload to specify a collection named "@type" whose + # value is a string, as opposed to a list of dictionaries. # - documents_to_insert = list(filter(is_dict, raw_documents_to_insert)) - if len(raw_documents_to_insert) - len(documents_to_insert) > 0: - print(f"Filtered out documents that were not dictionaries.") - - # If any documents survived that filtering stage, insert them. - if len(documents_to_insert) > 0: + # I don't know why they allow that. I posed the question in this GitHub Discussion: + # https://github.com/microbiomedata/nmdc-runtime/discussions/858 + # + # The `len(...) > 0` check is here because pymongo complains when `insert_many` is called + # with an empty list. + # + if isinstance(documents_to_insert, list) and len(documents_to_insert) > 0: try: overlay_db.replace_or_insert_many( collection_name, documents_to_insert @@ -741,14 +735,11 @@ def is_dict(v) -> bool: reference_field_names_by_source_class_name = ( references.get_reference_field_names_by_source_class_name() ) - for source_collection_name, raw_documents_inserted in docs.items(): - # Filter out documents that are strings instead of dictionaries. - # - # Note: Again, this is to work around the fact that the previous validation stages allow for the - # request payload to specify a collection named "@type" whose value is a string, as opposed - # to a list of dictionaries. 
- # - documents_inserted = list(filter(is_dict, raw_documents_inserted)) + for source_collection_name, documents_inserted in docs.items(): + # If `documents_inserted` is not a list (which is a scenario that the previous validation stages + # allow), abort processing this collection and proceed to processing the next collection. + if not isinstance(documents_inserted, list): + continue # Check the referential integrity of the replaced or inserted documents. # From 999ea088eec8139567a4ffcc1bd9bdb310675e94 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 03:14:40 +0000 Subject: [PATCH 46/70] style: reformat --- nmdc_runtime/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 55dfbb44..1ecd88cc 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -720,7 +720,10 @@ def validate_json( # The `len(...) > 0` check is here because pymongo complains when `insert_many` is called # with an empty list. # - if isinstance(documents_to_insert, list) and len(documents_to_insert) > 0: + if ( + isinstance(documents_to_insert, list) + and len(documents_to_insert) > 0 + ): try: overlay_db.replace_or_insert_many( collection_name, documents_to_insert From c981e1fafc32ebf6d584788c9aaa8c145f3a2ee3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 20:40:51 -0800 Subject: [PATCH 47/70] Refactor: Split large unit test into multiple smaller ones --- tests/test_the_util/test_the_util.py | 146 ++++++++++++++------------- 1 file changed, 78 insertions(+), 68 deletions(-) diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index d03b9642..cf2ea291 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -1,3 +1,5 @@ +import pytest + from nmdc_runtime.api.db.mongo import get_mongo_db from nmdc_runtime.util import validate_json @@ -11,40 +13,47 @@ # returns when it considers the input to be valid. ok_result = {"result": "All Okay!"} +# Make a concise alias whose items we can unpack (via `**check_refs`) into `kwargs` +# when invoking the `validate_json` function in our tests. +check_refs = dict(check_inter_document_references=True) + + +@pytest.fixture +def db(): + r"""Returns a reference to the MongoDB database specified by environment variables.""" + return get_mongo_db() + -def test_validate_json(): - # Get a reference to the MongoDB database, since the `validate_json` function requires - # it to be passed in as a parameter. - mdb = get_mongo_db() +def test_validate_json_returns_valid_when_input_is_empty_dictionary(db): + assert validate_json({}, mdb=db) == ok_result + assert validate_json({}, mdb=db, **check_refs) == ok_result - # Test: An empty outer dictionary is valid. - database_dict = {} - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result - # Test: An empty collection is valid. +def test_validate_json_returns_valid_when_collection_is_empty_list(db): database_dict = {"study_set": []} - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result + assert validate_json(in_docs=database_dict, mdb=db) == ok_result + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result - # Test: The function reports an error for a schema-defiant collection name. 
+ +def test_validate_json_returns_invalid_when_collection_name_is_schema_defiant(db): database_dict = {"OTHER_set": []} - result = validate_json(in_docs=database_dict, mdb=mdb) + result = validate_json(in_docs=database_dict, mdb=db) assert result["result"] == "errors" assert len(result["detail"]["OTHER_set"]) == 1 - # + # Invoke the function-under-test again, but with referential integrity checking enabled. - # - result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) assert result["result"] == "errors" assert len(result["detail"]["OTHER_set"]) == 1 - # Test: Two empty collections is valid. + +def test_validate_json_returns_valid_when_payload_has_multiple_empty_collections(db): database_dict = {"biosample_set": [], "study_set": []} - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result + assert validate_json(in_docs=database_dict, mdb=db) == ok_result + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result - # Test: A schema-compliant document is valid. + +def test_validate_json_returns_valid_when_the_only_document_is_schema_compliant(db): database_dict = { "study_set": [ { @@ -54,31 +63,50 @@ def test_validate_json(): } ] } - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result + assert validate_json(in_docs=database_dict, mdb=db) == ok_result + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + + +def test_validate_json_returns_valid_when_all_documents_are_schema_compliant(db): + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + }, + { + "id": "nmdc:sty-00-000002", + "type": "nmdc:Study", + "study_category": "research_study", + }, + ] + } + assert validate_json(in_docs=database_dict, mdb=db) == ok_result + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + - # Test: A schema-defiant document is invalid. +def test_validate_json_returns_invalid_when_document_is_schema_defiant(db): database_dict = { "study_set": [ { - "id": "nmdc:OTHER-00-000001", + "id": "nmdc:OTHER-00-000001", # invalid string format "type": "nmdc:Study", "study_category": "research_study", }, ] } - result = validate_json(in_docs=database_dict, mdb=mdb) + result = validate_json(in_docs=database_dict, mdb=db) assert result["result"] == "errors" assert len(result["detail"]["study_set"]) == 1 - # + # Invoke the function-under-test again, but with referential integrity checking enabled. - # - result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) assert result["result"] == "errors" assert len(result["detail"]["study_set"]) == 1 - # Test: An otherwise schema-compliant document that references a non-existent document is valid when referential - # integrity checking is disabled, and is invalid when referential integrity checking is enabled. 
+ +def test_validate_json_returns_invalid_when_otherwise_schema_compliant_document_references_missing_document(db): database_dict = { "study_set": [ { @@ -89,62 +117,44 @@ def test_validate_json(): } ] } - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - # + assert validate_json(in_docs=database_dict, mdb=db) == ok_result + # Invoke the function-under-test again, but with referential integrity checking enabled. - # - result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) assert len(result["detail"]["study_set"]) == 1 assert "nmdc:sty-00-000001" in result["detail"]["study_set"][0] - # Test: Multiple schema-compliant documents are valid. - database_dict = { - "study_set": [ - { - "id": "nmdc:sty-00-000001", - "type": "nmdc:Study", - "study_category": "research_study", - }, - { - "id": "nmdc:sty-00-000002", - "type": "nmdc:Study", - "study_category": "research_study", - }, - ] - } - assert validate_json(in_docs=database_dict, mdb=mdb) == ok_result - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result - # Test: The function reports an error for each schema-defiant document. +def test_validate_json_does_not_check_references_if_documents_are_schema_defiant(db): database_dict = { "study_set": [ { - "id": "nmdc:OTHER-00-000001", - "type": "nmdc:Study", - "study_category": "research_study", - }, - { - "id": "nmdc:OTHER-00-000002", + "id": "nmdc:OTHER-00-000001", # invalid string format "type": "nmdc:Study", "study_category": "research_study", + "part_of": ["nmdc:sty-00-000009"], # identifies a non-existent study }, ] } - result = validate_json(in_docs=database_dict, mdb=mdb) + result = validate_json(in_docs=database_dict, mdb=db) assert result["result"] == "errors" assert "study_set" in result["detail"] - assert len(result["detail"]["study_set"]) == 2 - # + assert len(result["detail"]["study_set"]) == 1 + # Invoke the function-under-test again, but with referential integrity checking enabled. - # - result = validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) assert result["result"] == "errors" assert "study_set" in result["detail"] - assert len(result["detail"]["study_set"]) == 2 + assert len(result["detail"]["study_set"]) == 1 # not 2 + - # Test: A single request can add a document that references another document added via the same request. The - # referential integrity checker performs its check on the _final_ result of all requested operations across - # all collections. +def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db): + r""" + Note: This test targets the scenario where a single payload introduces both the source document and target document + of a given reference, and those documents reside in different collections. If the referential integrity + checker were to performs a check after each individual collection's changes had been applied, it would not + find referenced documents that hadn't been introduced into the database yet. 
+ """ database_dict = { "biosample_set": [ { @@ -159,4 +169,4 @@ def test_validate_json(): {"id": "nmdc:sty-00-000002", "type": "nmdc:Study", "study_category": "research_study", "part_of": ["nmdc:sty-00-000001"]} ] } - assert validate_json(in_docs=database_dict, mdb=mdb, check_inter_document_references=True) == ok_result + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result From 90c7070eaee707633867bc2823b05373c8ca5ede Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 13 Jan 2025 20:48:05 -0800 Subject: [PATCH 48/70] Clarify comments --- nmdc_runtime/util.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 1ecd88cc..c93ba35c 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -714,7 +714,7 @@ def validate_json( # validation stages allow for the request payload to specify a collection named "@type" whose # value is a string, as opposed to a list of dictionaries. # - # I don't know why they allow that. I posed the question in this GitHub Discussion: + # I don't know why those stages do that. I posed the question in this GitHub Discussion: # https://github.com/microbiomedata/nmdc-runtime/discussions/858 # # The `len(...) > 0` check is here because pymongo complains when `insert_many` is called @@ -745,10 +745,6 @@ def validate_json( continue # Check the referential integrity of the replaced or inserted documents. - # - # Note: Much of this code was copy/pasted from refscan, at: - # https://github.com/microbiomedata/refscan/blob/46daba3b3cd05ee6a8a91076515f737248328cdb/refscan/refscan.py#L286-L349 - # print( f"Checking references emanating from documents inserted into '{source_collection_name}'." ) From d540afbf6d965855ee275d0b5a9aa8274ce11d3f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:26:13 -0800 Subject: [PATCH 49/70] Fix bug where `validate_json` ref. int. check ignored existing documents --- nmdc_runtime/util.py | 108 ++++++++++++--------------- tests/test_the_util/test_the_util.py | 47 +++++++++++- 2 files changed, 95 insertions(+), 60 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index c93ba35c..81f1fdc2 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -697,69 +697,59 @@ def validate_json( # Third pass (if enabled): Check inter-document references. if check_inter_document_references is True: - # Insert all documents specified for all collections specified, into the OverlayDB. + # Prepare to use `refscan`. # - # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, - # instead, validate it after processing _each_ collection, we would get a false positive if a document - # inserted into an earlier-processed collection happened to reference a document slated for insertion - # into a later-processed collection. By waiting until all documents in all collections specified have - # been inserted, we avoid that situation. + # Note: We check the inter-document references in two stages, which are: + # 1. For each document in the JSON payload, check whether each document it references already exists + # (in the collections the schema says it can exist in) in the database. We use the + # `refscan` package to do this, which returns violation details we'll use in the second stage. + # 2. For each violation found in the first stage (i.e. 
each reference to a not-found document), we + # check whether that document exists (in the collections the schema says it can exist in) in the + # JSON payload. If it does, then we "waive" (i.e. discard) that violation. + # The violations that remain after those two stages are the ones we return to the caller. # - with OverlayDB(mdb) as overlay_db: - print(f"Inserting documents into the OverlayDB.") - for collection_name, documents_to_insert in docs.items(): - # Insert the documents into the OverlayDB. - # - # Note: The `isinstance(..., list)` check is here to work around the fact that the previous - # validation stages allow for the request payload to specify a collection named "@type" whose - # value is a string, as opposed to a list of dictionaries. - # - # I don't know why those stages do that. I posed the question in this GitHub Discussion: - # https://github.com/microbiomedata/nmdc-runtime/discussions/858 - # - # The `len(...) > 0` check is here because pymongo complains when `insert_many` is called - # with an empty list. - # - if ( - isinstance(documents_to_insert, list) - and len(documents_to_insert) > 0 - ): - try: - overlay_db.replace_or_insert_many( - collection_name, documents_to_insert - ) - except OverlayDBError as error: - validation_errors[collection_name].append(str(error)) - - # Now that the OverlayDB contains all the specified documents, we will check whether - # every document referenced by any of the inserted documents exists. - finder = Finder(database=overlay_db) - references = get_allowed_references() - reference_field_names_by_source_class_name = ( - references.get_reference_field_names_by_source_class_name() - ) - for source_collection_name, documents_inserted in docs.items(): - # If `documents_inserted` is not a list (which is a scenario that the previous validation stages - # allow), abort processing this collection and proceed to processing the next collection. - if not isinstance(documents_inserted, list): - continue - - # Check the referential integrity of the replaced or inserted documents. - print( - f"Checking references emanating from documents inserted into '{source_collection_name}'." + finder = Finder(database=mdb) + references = get_allowed_references() + reference_field_names_by_source_class_name = ( + references.get_reference_field_names_by_source_class_name() + ) + + # Iterate over the collections in the JSON payload. 
+ for source_collection_name, documents in in_docs.items(): + print(f"Checking inter-document references for {source_collection_name}") + for document in documents: + source_document = dict(document, _id=None) # adds `_id` field, since `refscan` requires it to exist + violations = scan_outgoing_references( + document=source_document, + schema_view=nmdc_schema_view(), + reference_field_names_by_source_class_name=reference_field_names_by_source_class_name, + references=references, + finder=finder, + collection_names=nmdc_database_collection_names(), + source_collection_name=source_collection_name, + user_wants_to_locate_misplaced_documents=False, ) - for document in documents_inserted: - violations = scan_outgoing_references( - document=document, - schema_view=nmdc_schema_view(), - reference_field_names_by_source_class_name=reference_field_names_by_source_class_name, - references=references, - finder=finder, - collection_names=nmdc_database_collection_names(), - source_collection_name=source_collection_name, - user_wants_to_locate_misplaced_documents=False, + + # For each violation, check whether the misplaced document is in the JSON payload, itself. + for violation in violations: + print(f"Checking whether inter-document referential integrity violation can be waived.") + can_waive_violation = False + # Determine which collections can contain the referenced document, based upon + # the schema class of which this source document is an instance. + target_collection_names = references.get_target_collection_names( + source_class_name=violation.source_class_name, + source_field_name=violation.source_field_name, ) - for violation in violations: + # Check whether the referenced document exists in any of those collections in the JSON payload. + for in_collection_name, in_collection_docs in in_docs.items(): + if in_collection_name in target_collection_names: + for in_collection_doc in in_collection_docs: + if in_collection_doc.get("id") == violation.target_id: + can_waive_violation = True + break # stop checking + if can_waive_violation: + break # stop checking + if not can_waive_violation: violation_as_str = ( f"Document '{violation.source_document_id}' " f"in collection '{violation.source_collection_name}' " diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index cf2ea291..3de72d36 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -6,7 +6,7 @@ # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo, # by issuing the following command from the root directory of the repository within the `fastapi` container: # ``` -# $ pytest tests/test_the_util/test_the_util.py +# $ pytest -vv tests/test_the_util/test_the_util.py # ``` # Define a reusable dictionary that matches the value the `validate_json` function @@ -148,6 +148,23 @@ def test_validate_json_does_not_check_references_if_documents_are_schema_defiant assert len(result["detail"]["study_set"]) == 1 # not 2 +def test_validate_json_reports_multiple_broken_references_emanating_from_single_document(db): + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + "part_of": ["nmdc:sty-00-000008", "nmdc:sty-00-000009"], # identifies 2 non-existent studies + }, + ] + } + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) + assert result["result"] == "errors" + assert "study_set" in result["detail"] + assert 
len(result["detail"]["study_set"]) == 2 + + def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db): r""" Note: This test targets the scenario where a single payload introduces both the source document and target document @@ -170,3 +187,31 @@ def test_validate_json_checks_referential_integrity_after_applying_all_collectio ] } assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + + +def test_validate_json_considers_existing_documents_when_checking_references(db): + r""" + Note: This test focuses on the case where the database already contains the to-be-referenced document; + as opposed to the to-be-referenced document being introduced via the same request payload. + For that reason, we will seed the database before calling the function-under-test. + """ + existing_study_id = "nmdc:sty-00-000001" + + db.get_collection("study_set").replace_one( + {"id": existing_study_id}, + {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"}, + upsert=True + ) + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000002", + "type": "nmdc:Study", + "study_category": "research_study", + "part_of": [existing_study_id], # identifies the existing study + }, + ] + } + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + + db.get_collection("study_set").delete_one({"id": existing_study_id}) From 08b6fefef7f87ab1f11f765da3b8cfe8f7ddce75 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:27:48 -0800 Subject: [PATCH 50/70] Delete defective class method, which is also now obsolete --- nmdc_runtime/util.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 81f1fdc2..0ad164bc 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -570,13 +570,6 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._bottom_db.client.drop_database(self._top_db.name) - def get_collection(self, coll_name: str): - r"""Returns a reference to the specified collection.""" - try: - return self._top_db[coll_name] - except OperationFailure as e: - raise OverlayDBError(str(e.details)) - def replace_or_insert_many(self, coll_name, documents: list): try: self._top_db[coll_name].insert_many(documents) From 592d837d00fdbd3c169ac248203f4b83ab530458 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 16 Jan 2025 05:39:05 +0000 Subject: [PATCH 51/70] style: reformat --- nmdc_runtime/util.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 0ad164bc..35e5ecfb 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -709,9 +709,13 @@ def validate_json( # Iterate over the collections in the JSON payload. for source_collection_name, documents in in_docs.items(): - print(f"Checking inter-document references for {source_collection_name}") + print( + f"Checking inter-document references for {source_collection_name}" + ) for document in documents: - source_document = dict(document, _id=None) # adds `_id` field, since `refscan` requires it to exist + source_document = dict( + document, _id=None + ) # adds `_id` field, since `refscan` requires it to exist violations = scan_outgoing_references( document=source_document, schema_view=nmdc_schema_view(), @@ -725,19 +729,26 @@ def validate_json( # For each violation, check whether the misplaced document is in the JSON payload, itself. 
for violation in violations: - print(f"Checking whether inter-document referential integrity violation can be waived.") + print( + f"Checking whether inter-document referential integrity violation can be waived." + ) can_waive_violation = False # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. - target_collection_names = references.get_target_collection_names( - source_class_name=violation.source_class_name, - source_field_name=violation.source_field_name, + target_collection_names = ( + references.get_target_collection_names( + source_class_name=violation.source_class_name, + source_field_name=violation.source_field_name, + ) ) # Check whether the referenced document exists in any of those collections in the JSON payload. for in_collection_name, in_collection_docs in in_docs.items(): if in_collection_name in target_collection_names: for in_collection_doc in in_collection_docs: - if in_collection_doc.get("id") == violation.target_id: + if ( + in_collection_doc.get("id") + == violation.target_id + ): can_waive_violation = True break # stop checking if can_waive_violation: From 2f65bbec451ea4cb409cf124fc935cc82a546dae Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:44:51 -0800 Subject: [PATCH 52/70] Simplify variable names and remove print statements --- nmdc_runtime/util.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 35e5ecfb..eafdf274 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -709,13 +709,9 @@ def validate_json( # Iterate over the collections in the JSON payload. for source_collection_name, documents in in_docs.items(): - print( - f"Checking inter-document references for {source_collection_name}" - ) for document in documents: - source_document = dict( - document, _id=None - ) # adds `_id` field, since `refscan` requires it to exist + # Add an `_id` field to the document, since `refscan` requires the document to have one. + source_document = dict(document, _id=None) violations = scan_outgoing_references( document=source_document, schema_view=nmdc_schema_view(), @@ -729,9 +725,6 @@ def validate_json( # For each violation, check whether the misplaced document is in the JSON payload, itself. for violation in violations: - print( - f"Checking whether inter-document referential integrity violation can be waived." - ) can_waive_violation = False # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. @@ -742,13 +735,10 @@ def validate_json( ) ) # Check whether the referenced document exists in any of those collections in the JSON payload. 
- for in_collection_name, in_collection_docs in in_docs.items(): - if in_collection_name in target_collection_names: - for in_collection_doc in in_collection_docs: - if ( - in_collection_doc.get("id") - == violation.target_id - ): + for json_coll_name, json_coll_docs in in_docs.items(): + if json_coll_name in target_collection_names: + for json_coll_doc in json_coll_docs: + if json_coll_doc["id"] == violation.target_id: can_waive_violation = True break # stop checking if can_waive_violation: From 236c8115dc8e57f412367de645cfa7cab955e24f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 00:22:27 -0800 Subject: [PATCH 53/70] Replace literal string with variable to emphasize sameness --- tests/test_the_util/test_the_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index 3de72d36..0fdc09a2 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -199,7 +199,7 @@ def test_validate_json_considers_existing_documents_when_checking_references(db) db.get_collection("study_set").replace_one( {"id": existing_study_id}, - {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"}, + {"id": existing_study_id, "type": "nmdc:Study", "study_category": "research_study"}, upsert=True ) database_dict = { From 4ce19fab5ee910f0967f714b8e235198953152f6 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 11:47:01 -0800 Subject: [PATCH 54/70] Document limitation of `OverlayDB` based upon my recent misunderstanding --- nmdc_runtime/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index eafdf274..9911e024 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -542,6 +542,13 @@ class OverlayDB(AbstractContextManager): overlay collection, that id is marked as "seen" and will not also be returned when subsequently scanning the (unmodified) base-database collection. + Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged" + database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via + `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to + the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the + "merging" just-in-time to process the method invocation. You can see an example of this in the implementation + of the `merge_find` method, which internally accesses both the real database and the overlaying database. + Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected documents from a base collection to the overlay, and then applying the updates to the overlay, so that again, base collections are unmodified, and a "merge_find" call will produce a result From 22943236a98b841d218b05b9d1370064005e7950 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 11:55:25 -0800 Subject: [PATCH 55/70] Add comment explaining rationale for not using `OverlayDB` class --- nmdc_runtime/util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 9911e024..c393ccca 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -708,6 +708,11 @@ def validate_json( # JSON payload. If it does, then we "waive" (i.e. discard) that violation. 
# The violations that remain after those two stages are the ones we return to the caller.
 #
+ # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+ # does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+ # is not a drop-in replacement for pymongo's `Database` class, which is the only thing that
+ # `refscan`'s `Finder` class accepts.
+ #
 finder = Finder(database=mdb)
 references = get_allowed_references()
 reference_field_names_by_source_class_name = (

From 5ab16c186a801e94508a2470f7a7522247c1b88e Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 16 Jan 2025 12:09:46 -0800
Subject: [PATCH 56/70] Add user-facing documentation about `/metadata/json:validate` behavior

---
 nmdc_runtime/api/endpoints/metadata.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py
index 9b6288fe..ea89027d 100644
--- a/nmdc_runtime/api/endpoints/metadata.py
+++ b/nmdc_runtime/api/endpoints/metadata.py
@@ -168,10 +168,13 @@ def iter_grid_out():

 @router.post("/metadata/json:validate", name="Validate JSON")
 async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)):
-    """
-
+    r"""
     Validate a NMDC JSON Schema "nmdc:Database" object.
+
+    This API endpoint validates the JSON payload in two steps. The first step is to check the format of each document
+    (e.g., the presence, name, and value of each field). If it encounters any violations during that step, it will not
+    proceed to the second step. The second step is to check whether all documents referenced by those documents exist,
+    whether in the database or in the same JSON payload. We call the second step a "referential integrity check."
     """
     return validate_json(docs, mdb, check_inter_document_references=True)

From 2e1a48dbe7795cb9d111bf907f73108c7c768e8f Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 16 Jan 2025 23:43:32 -0800
Subject: [PATCH 57/70] Begin writing Python notebook demonstrating general usage of Runtime API

---
 docs/nb/general_usage.ipynb | 357 ++++++++++++++++++++++++++++++++++++
 1 file changed, 357 insertions(+)
 create mode 100644 docs/nb/general_usage.ipynb

diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb
new file mode 100644
index 00000000..8e8fcf4a
--- /dev/null
+++ b/docs/nb/general_usage.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8eadc33e15fcf211",
+   "metadata": {},
+   "source": [
+    "# Using the NMDC Runtime API\n",
+    "\n",
+    "## Introduction\n",
+    "\n",
+    "In this tutorial, I'll show you how you can use a Python notebook to interact with the NMDC Runtime API.\n",
+    "\n",
+    "Specifically, I'll show you how you can use a Python notebook to (a) submit HTTP requests, (b) parse HTTP responses, and (c) authenticate an HTTP client.\n",
+    "\n",
+    "## Getting help\n",
+    "\n",
+    "In case you have questions about the contents of this notebook, you can post them as [GitHub issues](https://github.com/microbiomedata/nmdc-runtime/issues/new) in the `microbiomedata/nmdc-runtime` GitHub repository, in which this notebook resides. NMDC team members regularly review open issues. In case you don't have a GitHub account, you can email your questions to the [NMDC Support Team](mailto:support@microbiomedata.org)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39cbe06680a4916d",
+   "metadata": {},
+   "source": [
+    "## 1. 
Install dependencies\n", + "\n", + "Before you can access the NMDC Runtime API—which runs as an HTTP service—you'll need an HTTP client. A popular HTTP client for Python is called `requests`. You can install it on your computer by running the following cell:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "49fe57573c851cba", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:36:26.872098Z", + "start_time": "2025-01-17T06:36:25.703022Z" + } + }, + "outputs": [], + "source": [ + "%pip install requests" + ] + }, + { + "cell_type": "markdown", + "id": "c2d17a26f190dcc0", + "metadata": {}, + "source": [ + "Now that the `requests` package is installed, you can use it to send HTTP requests to HTTP servers. For example, you can run the following cell to submit an HTTP GET request to an example HTTP server:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "62d28232a8221431", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:46:09.787642Z", + "start_time": "2025-01-17T06:46:09.613677Z" + } + }, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "# Submit an HTTP GET request to an example HTTP server.\n", + "response = requests.get(\"https://jsonplaceholder.typicode.com/posts/1\")" + ] + }, + { + "cell_type": "markdown", + "id": "cc9a5755432a76dc", + "metadata": {}, + "source": [ + "Now that you've submitted the HTTP request, the `response` variable contains information about the HTTP response the example HTTP server sent back. You can examine it by running the following cells:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ae560ed292755cd2", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:46:11.147029Z", + "start_time": "2025-01-17T06:46:11.143317Z" + } + }, + "outputs": [], + "source": [ + "# Get the HTTP status code from the response.\n", + "response.status_code" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "870810045483f31f", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:46:12.367741Z", + "start_time": "2025-01-17T06:46:12.361007Z" + } + }, + "outputs": [], + "source": [ + "# Parse the response as a JSON string.\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "8f3cd7b73636ff0b", + "metadata": {}, + "source": [ + "If the first of those cells outputs the number `200` and the second one outputs a Python dictionary having several keys (including `id` and `title`), you are good to go!\n", + "\n", + "> In case those cells did not output those things, here are some troubleshooting tips: (1) check your Internet connection, (2) visit the same URL from the example above, in your web browser, and (3) review the [documentation](https://requests.readthedocs.io/en/latest/) of the `requests` package.\n", + "\n", + "Now that you can access _an_ HTTP server, let's access the **NMDC Runtime API**." + ] + }, + { + "cell_type": "markdown", + "id": "fd84435af1503b22", + "metadata": {}, + "source": [ + "## 2. Access an NMDC Runtime API endpoint\n", + "\n", + "The NMDC Runtime API has a variety of API endpoints that you send HTTP requests to.\n", + "\n", + "> The full list of API endpoints is listed in the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", + "\n", + "One of the API endpoints that I like to send HTTP requests to is `/studies`. 
That API endpoint responds with a list of all the studies that exist in the NMDC database!\n", + "\n", + "You can run the following cell to send an HTTP GET request to that API endpoint:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8cc19fe2047322a8", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:57:22.368322Z", + "start_time": "2025-01-17T06:57:22.048361Z" + } + }, + "outputs": [], + "source": [ + "response = requests.get(\"https://api.microbiomedata.org/studies\")" + ] + }, + { + "cell_type": "markdown", + "id": "77d0bc630d308a46", + "metadata": {}, + "source": [ + "Now that you have received an HTTP response from the endpoint, you can examine it like before. You can see the JSON data—in this case, a list of studies—by running the code in this cell:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e9bf89ac847d5383", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T06:58:31.537245Z", + "start_time": "2025-01-17T06:58:31.449579Z" + } + }, + "outputs": [], + "source": [ + "response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "ed22b648710a26c6", + "metadata": {}, + "source": [ + "Whoa! That's a lot of output. Let's break it down.\n", + "\n", + "You can run the following cell to see only its top-level properties:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cdc717a66bfc3136", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T07:05:21.982850Z", + "start_time": "2025-01-17T07:05:21.972839Z" + } + }, + "outputs": [], + "source": [ + "response.json().keys()" + ] + }, + { + "cell_type": "markdown", + "id": "6611146f6253a742", + "metadata": {}, + "source": [ + "The `meta` property contains data _about the response_, such as pagination parameters and search filter criteria.\n", + "\n", + "The `results` property contains the requested data—in this case, a list of studies.\n", + "\n", + "You can ignore the `group_by` property. According to the NMDC Runtime API's API documentation, `group_by` is not implemented yet.\n", + "\n", + "Let's display just the `meta` property:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4f30bc5dee894252", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T07:11:23.271756Z", + "start_time": "2025-01-17T07:11:23.249253Z" + } + }, + "outputs": [], + "source": [ + "response.json()[\"meta\"]" + ] + }, + { + "cell_type": "markdown", + "id": "5eb4a11914ab2971", + "metadata": {}, + "source": [ + "According to the `meta` property, there are 32 studies in the database.\n", + "\n", + "> Note: At the time of this writing, there are 32. When you run the cell, you may see a different number. The database is constantly changing.\n", + "\n", + "Let's count the studies we received in the `results` list:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c5d6b38c5888050a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T07:13:27.936952Z", + "start_time": "2025-01-17T07:13:27.930606Z" + } + }, + "outputs": [], + "source": [ + "len(response.json()[\"results\"])" + ] + }, + { + "cell_type": "markdown", + "id": "e9b22bd4ae4f88b9", + "metadata": {}, + "source": [ + "The `results` list contains only 25 studies—as opposed to 32. 
That's because this endpoint uses [pagination](https://en.wikipedia.org/wiki/Pagination#In_Database), and the default page size happens to be 25.\n", + "\n", + "You can customize the page size like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "176887bd5a7c241e", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T07:17:03.432263Z", + "start_time": "2025-01-17T07:17:02.593617Z" + } + }, + "outputs": [], + "source": [ + "# Resend the same HTTP request, but include a higher page size than the default of 25.\n", + "response = requests.get(\"https://api.microbiomedata.org/studies?per_page=100\")\n", + "\n", + "# Count the studies in the `results` list.\n", + "len(response.json()[\"results\"])" + ] + }, + { + "cell_type": "markdown", + "id": "ed4c5916f7315145", + "metadata": {}, + "source": [ + "There they are!\n", + "\n", + "You can use the `per_page` parameter to customize the number of items you want to receive per HTTP response.\n", + "\n", + "You can use other parameters to customize the response in other ways, too. For example, you can run the following cell to request only studies whose `ecosystem_category` value is `Aquatic`, and request that the API response contain at most two studies." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e012062c4f4d454d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-17T07:26:20.476666Z", + "start_time": "2025-01-17T07:26:20.373037Z" + } + }, + "outputs": [], + "source": [ + "response = requests.get(\"https://api.microbiomedata.org/studies?filter=ecosystem_category:Aquatic&per_page=2&sort_by=name\")\n", + "\n", + "# Print the number of studies in the response.\n", + "print(len(response.json()[\"results\"]))\n", + "\n", + "# Print their names in the order in which they appear in the response.\n", + "for study in response.json()[\"results\"]:\n", + " print(study[\"name\"])" + ] + }, + { + "cell_type": "markdown", + "id": "eeab95830833d1f7", + "metadata": {}, + "source": [ + "**Congratulations!** You've used a Python notebook to retrieve data residing in the NMDC database, via the NMDC Runtime API. 🎉" + ] + }, + { + "cell_type": "markdown", + "id": "92c00d38", + "metadata": {}, + "source": [ + "## 3. Access a _protected_ NMDC Runtime API endpoint\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "5e75dc49", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 5, + "nbformat_minor": 9 +} From f5f78ec458bd9cdfe212ede0866189c37456b38e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Jan 2025 19:15:14 -0800 Subject: [PATCH 58/70] Update GHA workflows to deploy redirects instead of MkDocs site --- .github/workflows/deploy-redirects.yml | 84 ++++++++++++++++++++++++++ .github/workflows/mkdocs.yml | 26 ++++++-- 2 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/deploy-redirects.yml diff --git a/.github/workflows/deploy-redirects.yml b/.github/workflows/deploy-redirects.yml new file mode 100644 index 00000000..ae7742ff --- /dev/null +++ b/.github/workflows/deploy-redirects.yml @@ -0,0 +1,84 @@ +# This GitHub Actions workflow builds a website file tree and deploys it to GitHub Pages. 
+# Reference: https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions +name: Deploy redirects to GitHub Pages + +on: + push: { branches: [ main ] } + workflow_dispatch: { } + +# Reference: https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + group: github-pages + cancel-in-progress: true + +jobs: + build: + name: Build website + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Check out commit + uses: actions/checkout@v4 + - name: Define helper bash function + run: | + make_redirect() { + # This function prints the HTML markup for a web page that redirects the client. + local url="https://docs.microbiomedata.org/runtime/${1}" + echo "Redirecting to ${url}..." + } + - name: Create website file tree + run: | + # Create directories. + mkdir -p \ + _build/html \ + _build/html/explanation \ + _build/html/howto-guides \ + _build/html/howto-guides/jobs \ + _build/html/tutorials \ + _build/html/nb + + # Create HTML files containing redirects. + cd _build/html + make_redirect index.html > index.html + make_redirect admin.html > admin.html + make_redirect draft.html > draft.html + make_redirect contributing-docs.html > contributing-docs.html + make_redirect explanation/domain-vision-statement.html > explanation/domain-vision-statement.html + make_redirect explanation/identifiers.html > explanation/identifiers.html + make_redirect explanation/journeys.html > explanation/journeys.html + make_redirect howto-guides/update-sensors-ops.html > howto-guides/update-sensors-ops.html + make_redirect howto-guides/create-triggers.html > howto-guides/create-triggers.html + make_redirect howto-guides/improving-search-api.html > howto-guides/improving-search-api.html + make_redirect howto-guides/release-process.html > howto-guides/release-process.html + make_redirect howto-guides/author-changesheets.html > howto-guides/author-changesheets.html + make_redirect howto-guides/claim-and-run-jobs.html > howto-guides/claim-and-run-jobs.html + make_redirect howto-guides/jobs/gold-translation-etl.html > howto-guides/jobs/gold-translation-etl.html + make_redirect tutorials/json.html > tutorials/json.html + make_redirect tutorials/exporters.html > tutorials/exporters.html + make_redirect tutorials/metadata-in.html > tutorials/metadata-in.html + make_redirect tutorials/auth.html > tutorials/auth.html + make_redirect tutorials/translators.html > tutorials/translators.html + make_redirect nb/bulk_validation_referential_integrity_check.html > nb/bulk_validation_referential_integrity_check.html + make_redirect nb/get_data.html > nb/get_data.html + make_redirect nb/queue_and_trigger_data_jobs.html > nb/queue_and_trigger_data_jobs.html + make_redirect nb/wf_automation.html > nb/wf_automation.html + - name: Save the result for publishing to GitHub Pages + uses: actions/upload-pages-artifact@v3 + with: + path: _build/html + deploy: + name: Deploy website + needs: + - build + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index dab02ceb..ec418000 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -20,11 +20,25 @@ jobs: - run: pip install git+https://${GH_TOKEN}@github.com/squidfunk/mkdocs-material-insiders.git - run: pip install mkdocs-mermaid2-plugin - run: 
pip install mkdocs-jupyter - - run: mkdocs gh-deploy --force + # Build the MkDocs site (but don't deploy it). + # + # Note: Until January 17, 2025, we would build and deploy the MkDocs website to GitHub Pages. + # As of January 17, 2025, we just _build_ it — we don't deploy it. + # + # The reason is that the NMDC has a new, centralized documentation website: https://docs.microbiomedata.org. + # That website includes a copy of the Runtime documentation (it's at the path, `/runtime`). Instead of + # deploying this redundant copy (being built here) to `https://microbiomedata.github.io/nmdc-runtime`, + # we instead deploy _HTTP Redirects_ there. Those HTTP Redirects will redirect visitors from + # `https://microbiomedata.github.io/nmdc-runtime` to `https://docs.microbiomedata.org/runtime`. + # + # This redirection is not implemented in the MkDocs site, itself. That's because those source files + # are used to build the website hosted at `https://docs.microbiomedata.org/runtime`; and, if a page were + # to redirect to itself, the result would be a circular redirect. + # + # The reason we still bother to build the MkDocs site here is so we get an error (a failed GHA workflow run) + # when the site is no longer buildable (e.g. due to someone inadvertently introducing an invalid source + # file, or due to one of the dependencies becoming unavailable). + # + - run: mkdocs build env: GH_TOKEN: ${{ secrets.GH_TOKEN_MKDOCS_MATERIAL_INSIDERS }} - - - - - From 08cbc7ac56862616043829c259a3b86f41e11049 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Jan 2025 19:27:21 -0800 Subject: [PATCH 59/70] Refine comment --- .github/workflows/mkdocs.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index ec418000..b96958fa 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -28,16 +28,17 @@ jobs: # The reason is that the NMDC has a new, centralized documentation website: https://docs.microbiomedata.org. # That website includes a copy of the Runtime documentation (it's at the path, `/runtime`). Instead of # deploying this redundant copy (being built here) to `https://microbiomedata.github.io/nmdc-runtime`, - # we instead deploy _HTTP Redirects_ there. Those HTTP Redirects will redirect visitors from - # `https://microbiomedata.github.io/nmdc-runtime` to `https://docs.microbiomedata.org/runtime`. + # we will (via the new `deploy-redirects.yml` workflow) deploy a bunch of HTML files that redirect visitors + # from pages on `https://microbiomedata.github.io/nmdc-runtime` to their counterparts on + # `https://docs.microbiomedata.org/runtime`. # - # This redirection is not implemented in the MkDocs site, itself. That's because those source files + # This redirection is not implemented in the MkDocs site, _itself_, because those MkDocs site source files # are used to build the website hosted at `https://docs.microbiomedata.org/runtime`; and, if a page were # to redirect to itself, the result would be a circular redirect. # # The reason we still bother to build the MkDocs site here is so we get an error (a failed GHA workflow run) # when the site is no longer buildable (e.g. due to someone inadvertently introducing an invalid source - # file, or due to one of the dependencies becoming unavailable). + # file, or due to one of the dependencies becoming unavailable), which can prompt remedial action. 
# - run: mkdocs build env: From 12332bf373cbdaf0c2bd2dcd13c07b90e322e964 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Jan 2025 19:32:02 -0800 Subject: [PATCH 60/70] Define bash function within same GHA step as its invocations --- .github/workflows/deploy-redirects.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-redirects.yml b/.github/workflows/deploy-redirects.yml index ae7742ff..4e48aebc 100644 --- a/.github/workflows/deploy-redirects.yml +++ b/.github/workflows/deploy-redirects.yml @@ -20,15 +20,15 @@ jobs: steps: - name: Check out commit uses: actions/checkout@v4 - - name: Define helper bash function + - name: Create website file tree run: | + # Define helper function. make_redirect() { # This function prints the HTML markup for a web page that redirects the client. local url="https://docs.microbiomedata.org/runtime/${1}" echo "Redirecting to ${url}..." - } - - name: Create website file tree - run: | + } + # Create directories. mkdir -p \ _build/html \ From 68277181e3317043941eefe9c1b69edced55291a Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 12:14:03 -0800 Subject: [PATCH 61/70] Refine existing contents of notebook before adding to it --- docs/nb/general_usage.ipynb | 42 ++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb index 8e8fcf4a..a0f3e3cb 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/general_usage.ipynb @@ -15,7 +15,7 @@ "\n", "## Getting help\n", "\n", - "In case you have questions about the contents of this notebook, you can post them as [GitHub issues](https://github.com/microbiomedata/nmdc-runtime/issues/new) in the `microbiomedata/nmdc-runtime` GitHub repository, in which this notebook resides. NMDC team members regularly review open issues. In case you don't have a GitHub account, you can email your questions to the [NMDC Support Team](mailto:support@microbiomedata.org)." + "In case you have questions about the contents of this notebook, you can post them as [GitHub issues](https://github.com/microbiomedata/nmdc-runtime/issues/new) in the `microbiomedata/nmdc-runtime` GitHub repository (that's where this notebook resides). NMDC team members regularly review open issues there. In case you don't have a GitHub account, you can email your questions to the [NMDC Support Team](mailto:support@microbiomedata.org)." 
] }, { @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "49fe57573c851cba", "metadata": { "ExecuteTime": { @@ -117,7 +117,7 @@ "source": [ "If the first of those cells outputs the number `200` and the second one outputs a Python dictionary having several keys (including `id` and `title`), you are good to go!\n", "\n", - "> In case those cells did not output those things, here are some troubleshooting tips: (1) check your Internet connection, (2) visit the same URL from the example above, in your web browser, and (3) review the [documentation](https://requests.readthedocs.io/en/latest/) of the `requests` package.\n", + "> In case those cells did not output those things, here are some troubleshooting tips: (1) check your Internet connection, (2) visit the same URL from the example above, in your web browser, (3) review the [documentation](https://requests.readthedocs.io/en/latest/) of the `requests` package, and (4) restart your Jupyter [kernel](https://docs.jupyter.org/en/latest/projects/kernels.html) so it \"becomes aware\" of all recently-installed packages—in this case, the `requests` package.\n", "\n", "Now that you can access _an_ HTTP server, let's access the **NMDC Runtime API**." ] @@ -129,7 +129,7 @@ "source": [ "## 2. Access an NMDC Runtime API endpoint\n", "\n", - "The NMDC Runtime API has a variety of API endpoints that you send HTTP requests to.\n", + "The NMDC Runtime API has a variety of API endpoints that you can send HTTP requests to.\n", "\n", "> The full list of API endpoints is listed in the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", "\n", @@ -183,7 +183,9 @@ "source": [ "Whoa! That's a lot of output. Let's break it down.\n", "\n", - "You can run the following cell to see only its top-level properties:" + "> In the [API documentation](https://api.microbiomedata.org/docs#/find/find_studies_studies_get) for the `/studies` API endpoint, the \"Responses\" section contains an example response from the API endpoint, as well as a generic schema that all of the API endpoint's responses will conform to. You can use both of those things to make sense of the API endpoint's response.\n", + "\n", + "Given that—for this API endpoint—`response.json()` returns a Python dictionary, you can run the following cell to see the dictionary's top-level keys:" ] }, { @@ -206,13 +208,13 @@ "id": "6611146f6253a742", "metadata": {}, "source": [ - "The `meta` property contains data _about the response_, such as pagination parameters and search filter criteria.\n", + "The `meta` item contains data _about the response_, such as pagination parameters and search filter criteria.\n", "\n", - "The `results` property contains the requested data—in this case, a list of studies.\n", + "The `results` item contains the requested data—in this case, a list of studies.\n", "\n", - "You can ignore the `group_by` property. According to the NMDC Runtime API's API documentation, `group_by` is not implemented yet.\n", + "You can ignore the `group_by` item. According to the NMDC Runtime API's API documentation, `group_by` is not implemented yet.\n", "\n", - "Let's display just the `meta` property:" + "Let's examine the `meta` item:" ] }, { @@ -235,9 +237,9 @@ "id": "5eb4a11914ab2971", "metadata": {}, "source": [ - "According to the `meta` property, there are 32 studies in the database.\n", + "According to the `meta` item, there are 32 studies in the database.\n", "\n", - "> Note: At the time of this writing, there are 32. 
When you run the cell, you may see a different number. The database is constantly changing.\n", + "> Note: At the time of this writing, there are 32. When you run the cell, you may see a different number as the database is constantly changing.\n", "\n", "Let's count the studies we received in the `results` list:" ] @@ -262,7 +264,7 @@ "id": "e9b22bd4ae4f88b9", "metadata": {}, "source": [ - "The `results` list contains only 25 studies—as opposed to 32. That's because this endpoint uses [pagination](https://en.wikipedia.org/wiki/Pagination#In_Database), and the default page size happens to be 25.\n", + "The `results` list contains only _25_ studies—as opposed to _32_. That's because this endpoint uses [pagination](https://en.wikipedia.org/wiki/Pagination#In_Database), and the default page size happens to be 25.\n", "\n", "You can customize the page size like this:" ] @@ -295,7 +297,7 @@ "\n", "You can use the `per_page` parameter to customize the number of items you want to receive per HTTP response.\n", "\n", - "You can use other parameters to customize the response in other ways, too. For example, you can run the following cell to request only studies whose `ecosystem_category` value is `Aquatic`, and request that the API response contain at most two studies." + "You can use other parameters to customize the response in other ways, too. For example, you can run the following cell to request only studies whose `ecosystem_category` value is `Aquatic`, request that the API response contain at most 2 studies, and request that they be sorted by name." ] }, { @@ -347,9 +349,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" } }, "nbformat": 5, From 1f8d757717c7430d7444517482fad3c3e1abd1d3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 18:52:55 -0800 Subject: [PATCH 62/70] Add section about accessing private API endpoints --- docs/nb/general_usage.ipynb | 264 +++++++++++++++++++++++++++++++++++- 1 file changed, 257 insertions(+), 7 deletions(-) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb index a0f3e3cb..08f05729 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/general_usage.ipynb @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "62d28232a8221431", "metadata": { "ExecuteTime": { @@ -117,7 +117,7 @@ "source": [ "If the first of those cells outputs the number `200` and the second one outputs a Python dictionary having several keys (including `id` and `title`), you are good to go!\n", "\n", - "> In case those cells did not output those things, here are some troubleshooting tips: (1) check your Internet connection, (2) visit the same URL from the example above, in your web browser, (3) review the [documentation](https://requests.readthedocs.io/en/latest/) of the `requests` package, and (4) restart your Jupyter [kernel](https://docs.jupyter.org/en/latest/projects/kernels.html) so it \"becomes aware\" of all recently-installed packages—in this case, the `requests` package.\n", + "> 💡 **Tip:** In case those cells did not output those things, here are some troubleshooting tips: (1) check your Internet connection, (2) visit the same URL from the example above, 
in your web browser, (3) review the [documentation](https://requests.readthedocs.io/en/latest/) of the `requests` package, and (4) restart your Jupyter [kernel](https://docs.jupyter.org/en/latest/projects/kernels.html) so it \"becomes aware\" of all recently-installed packages—in this case, the `requests` package.\n", "\n", "Now that you can access _an_ HTTP server, let's access the **NMDC Runtime API**." ] @@ -131,7 +131,7 @@ "\n", "The NMDC Runtime API has a variety of API endpoints that you can send HTTP requests to.\n", "\n", - "> The full list of API endpoints is listed in the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", + "> 💡 **Tip:** The full list of API endpoints is listed in the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", "\n", "One of the API endpoints that I like to send HTTP requests to is `/studies`. That API endpoint responds with a list of all the studies that exist in the NMDC database!\n", "\n", @@ -183,7 +183,7 @@ "source": [ "Whoa! That's a lot of output. Let's break it down.\n", "\n", - "> In the [API documentation](https://api.microbiomedata.org/docs#/find/find_studies_studies_get) for the `/studies` API endpoint, the \"Responses\" section contains an example response from the API endpoint, as well as a generic schema that all of the API endpoint's responses will conform to. You can use both of those things to make sense of the API endpoint's response.\n", + "> 💡 **Tip:** In the [API documentation](https://api.microbiomedata.org/docs#/find/find_studies_studies_get) for the `/studies` API endpoint, the \"Responses\" section contains an example response from the API endpoint, as well as a generic schema that all of the API endpoint's responses will conform to. You can use both of those things to make sense of the API endpoint's response.\n", "\n", "Given that—for this API endpoint—`response.json()` returns a Python dictionary, you can run the following cell to see the dictionary's top-level keys:" ] @@ -335,14 +335,264 @@ "id": "92c00d38", "metadata": {}, "source": [ - "## 3. Access a _protected_ NMDC Runtime API endpoint\n", + "## 3. Access a _private_ NMDC Runtime API endpoint\n", + "\n", + "In the previous section, you accessed an API endpoint that **did not require authentication**. In this tutorial, I'll refer to such an API endpoint as a \"public\" API endpoint. Indeed, most of the NMDC Runtime's API endpoints are \"public.\"\n", + "\n", + "However, there are some API endpoints that **do require authentication**; for example, API endpoints that can be used to modify existing data or perform resource-intensive operations. In this tutorial, I'll refer to those API endpoints as \"private\" API endpoints.\n", + "\n", + "> 💡 **Tip:** You can tell whether an API endpoint is \"public\" or \"private\" by checking whether there is a padlock icon next to it in the [API documentation](https://api.microbiomedata.org/docs). If there is, the API endpoint is \"private\" (i.e., accessing it requires authentication); otherwise, it is \"public\" (i.e., accessing it does _not_ require authentication).\n", + "\n", + "In this section, I'll show you how you can access a \"private\" API endpoint." + ] + }, + { + "cell_type": "markdown", + "id": "23fd4d59", + "metadata": {}, + "source": [ + "The first step is to tell this notebook what your NMDC Runtime username and password are. 
You can do that by running the cell below, which will propmt you for input:\n", + "\n", + "> ⚠️ **Warning:** Storing real usernames and passwords directly in a Python notebook—or in any other form of source code—increases the risk that they be accidentally committed to a source code repository. That's why I'm using Python's [getpass](https://docs.python.org/3/library/getpass.html) module here, instead of suggesting that you type your username and password directly into the cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b7b529", + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "\n", + "# Prompt the user for their NMDC Runtime username and password.\n", + "username = getpass(prompt=\"NMDC Runtime username: \")\n", + "password = getpass(prompt=\"NMDC Runtime password: \")\n", + "\n", + "# Display string lengths as a \"sanity test.\"\n", + "print(f\"Username length: {len(username)}\")\n", + "print(f\"Password length: {len(password)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "915e036f", + "metadata": {}, + "source": [ + "Now that the `username` and `password` variables contain your NMDC Runtime username and password, you can exchange those for an NMDC Runtime API **access token**. You can do that by running this cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "827abd1c", + "metadata": {}, + "outputs": [], + "source": [ + "response = requests.post(\n", + " \"https://api.microbiomedata.org/token\",\n", + " data={\n", + " \"grant_type\": \"password\",\n", + " \"username\": username,\n", + " \"password\": password,\n", + " },\n", + ")\n", + "\n", + "# Print the response payload, which includes the access token.\n", + "response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "ddeba883", + "metadata": {}, + "source": [ + "The API response will contain several properties (you can list them via `response.json().keys()`). One of them is named `access_token`.\n", + "\n", + "The `access_token` property contains a string you can use to access \"private\" API endpoints. That string is the access token (hence, the name of the property).\n", + "\n", + "I recommend storing that access token in a Python variable for future reference. You can do that by running this cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b81266", + "metadata": {}, + "outputs": [], + "source": [ + "access_token = response.json()[\"access_token\"]\n", + "\n", + "print(f\"Access token: {access_token}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b53e5253", + "metadata": {}, + "source": [ + "Now that you have an access token, you can use it to access a \"private\" API endpoint.\n", + "\n", + "One of the \"privacy\" API endpoints I like to run is called `/queries:run`. I like it because I can use it to query the database in more sophisticated ways than some of the public API endpoints allow.\n", + "\n", + "> 💡 **Tip:** As with _all_ API endpoints, you can learn about this one by reading the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", + "\n", + "Let's use the \"private\" `/queries:run` API endpoint to find all the studies whose `ecosystem_category` value is `Aquatic` (just like we did with the \"public\" `/studies` API endpoint earlier)." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f8e5cd6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    \"https://api.microbiomedata.org/queries:run\",\n",
+    "    headers={\n",
+    "        \"Authorization\": f\"Bearer {access_token}\",\n",
+    "    },\n",
+    "    json={\n",
+    "        \"find\": \"study_set\",\n",
+    "        \"filter\": {\"ecosystem_category\": \"Aquatic\"},\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6a531da",
+   "metadata": {},
+   "source": [
+    "The API response's shape is different from that of the `/studies` API endpoint. Let's explore this API response. You can get a list of its top-level properties by running the following cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b2c1a61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.json().keys()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef161477",
+   "metadata": {},
+   "source": [
+    "In the case of the `/queries:run` API endpoint, the results are in the `cursor` property. Let's dig into that property. You can see its properties by running the following cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9a1f073",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.json()[\"cursor\"].keys()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "875ea7f3",
+   "metadata": {},
+   "source": [
+    "The studies are in the `firstBatch` property. You can count them by running this cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9afbae98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(response.json()[\"cursor\"][\"firstBatch\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1df499df",
+   "metadata": {},
+   "source": [
+    "You can print their names by running this cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d0ecffc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for study in response.json()[\"cursor\"][\"firstBatch\"]:\n",
+    "    print(study[\"name\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0d6d353a",
+   "metadata": {},
+   "source": [
+    "**Congratulations!** You've used a Python notebook to retrieve data residing in the NMDC database, via a \"private\" NMDC Runtime API endpoint. 🎉"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d652ee2",
+   "metadata": {},
+   "source": [
+    "Finally, let's see what would have happened if you had visited the same API endpoint _without_ including your access token in the API request. You can do that by running this cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c5446ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    \"https://api.microbiomedata.org/queries:run\",\n",
+    "    json={\n",
+    "        \"find\": \"study_set\",\n",
+    "        \"filter\": {\"ecosystem_category\": \"Aquatic\"},\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "response.json()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e3102339",
+   "metadata": {},
+   "source": [
+    "Since this is a \"private\" API endpoint, when you access it without specifying an access token, it responds with the message, \"`Could not validate credentials`\" (in this case, we didn't give it any credentials to validate)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f8385665", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this tutorial, I showed you how you could access a \"public\" API endpoint, how you could obtain an access token, and how you could use that access token to access a \"private\" API endpoint. I also showed you how you could explore a few API responses. Finally, I told you where you could find the [API documentation](https://api.microbiomedata.org/docs), which contains a list of all API endpoints.\n", + "\n", + "Thank you for going through this tutorial. You can continue to explore the [API documentation](https://api.microbiomedata.org/docs) and send API requests to API endpoints you find interesting.\n", "\n", - "TODO" + "We'd love to know what you think about the NMDC Runtime API and about this tutorial. You can tell us what you think by creating a [GitHub issue](https://github.com/microbiomedata/nmdc-runtime/issues/new) in the `microbiomedata/nmdc-runtime` GitHub repository or sending the NMDC Support Team an email at [support@microbiomedata.org](mailto:support@microbiomedata.org)." ] }, { "cell_type": "markdown", - "id": "5e75dc49", + "id": "f3272d9f", "metadata": {}, "source": [] } From f7f4890eab6010155de46dea3b0d025ebc70bcd9 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 19:03:56 -0800 Subject: [PATCH 63/70] Change `nbformat` version (try to resolve `NBFormatError`) --- docs/nb/general_usage.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb index 08f05729..9b192b01 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/general_usage.ipynb @@ -616,6 +616,6 @@ "version": "3.9.6" } }, - "nbformat": 5, + "nbformat": 4, "nbformat_minor": 9 } From fa6f47263f698e26bf7d654e63faca1b9a316daa Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 19:10:16 -0800 Subject: [PATCH 64/70] Populate `nbformat` property using value provided by VS Code --- docs/nb/general_usage.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb index 9b192b01..36d1e833 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/general_usage.ipynb @@ -617,5 +617,5 @@ } }, "nbformat": 4, - "nbformat_minor": 9 + "nbformat_minor": 2 } From 69207bd38122461dbbf7fe84e05a32aab960bb8e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 19:59:49 -0800 Subject: [PATCH 65/70] Refine introductory message in notebook --- docs/nb/general_usage.ipynb | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/general_usage.ipynb index 36d1e833..210c7f22 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/general_usage.ipynb @@ -11,7 +11,10 @@ "\n", "In this tutorial, I'll show you how you can use a Python notebook to interact with the NMDC Runtime API.\n", "\n", - "Specifically, I'll show you how you can use a Python notebook to (a) submit HTTP requests, (b) parse HTTP responses, and (c) authenticate an HTTP client.\n", + "By the end of this tutorial, you will have:\n", + "- Accessed several NMDC Runtime API endpoints\n", + "- Learned how you can discover additional NMDC Runtime API endpoints\n", + "- Learned how you can contact NMDC team members for help\n", "\n", "## Getting help\n", "\n", @@ -49,12 +52,14 @@ "id": "c2d17a26f190dcc0", "metadata": {}, "source": [ - "Now that the `requests` package is installed, you can use it to send HTTP requests to HTTP 
servers. For example, you can run the following cell to submit an HTTP GET request to an example HTTP server:" + "Now that the `requests` package is installed, you can use it to send HTTP requests to HTTP servers. For example, you can run the following cell to submit an HTTP GET request to an example HTTP server:\n", + "\n", + "> Note: This example HTTP server is not maintained by the NMDC team. It is a third-party HTTP server you can use to confirm your HTTP client works, independently of the NMDC Runtime." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "62d28232a8221431", "metadata": { "ExecuteTime": { @@ -135,7 +140,9 @@ "\n", "One of the API endpoints that I like to send HTTP requests to is `/studies`. That API endpoint responds with a list of all the studies that exist in the NMDC database!\n", "\n", - "You can run the following cell to send an HTTP GET request to that API endpoint:" + "You can run the following cell to send an HTTP GET request to that API endpoint:\n", + "\n", + "> Note: The HTTP response the server sends back will be stored in the `response` variable." ] }, { @@ -617,5 +624,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } From c13ccb0974dc3ce3a4641049454a661c6cc4cdc3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Sun, 19 Jan 2025 20:18:36 -0800 Subject: [PATCH 66/70] Add Python usage notebook to MkDocs navigation sidebar --- docs/nb/{general_usage.ipynb => api_access_via_python.ipynb} | 4 ++-- mkdocs.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) rename docs/nb/{general_usage.ipynb => api_access_via_python.ipynb} (99%) diff --git a/docs/nb/general_usage.ipynb b/docs/nb/api_access_via_python.ipynb similarity index 99% rename from docs/nb/general_usage.ipynb rename to docs/nb/api_access_via_python.ipynb index 210c7f22..77c774aa 100644 --- a/docs/nb/general_usage.ipynb +++ b/docs/nb/api_access_via_python.ipynb @@ -5,11 +5,11 @@ "id": "8eadc33e15fcf211", "metadata": {}, "source": [ - "# Using the NMDC Runtime API\n", + "# Use Python to access the NMDC Runtime API\n", "\n", "## Introduction\n", "\n", - "In this tutorial, I'll show you how you can use a Python notebook to interact with the NMDC Runtime API.\n", + "In this tutorial, I'll show you how you can use Python to interact with the NMDC Runtime API.\n", "\n", "By the end of this tutorial, you will have:\n", "- Accessed several NMDC Runtime API endpoints\n", diff --git a/mkdocs.yml b/mkdocs.yml index b077903f..1cc6006f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,7 @@ nav: - tutorials/metadata-in.md - tutorials/translators.md - tutorials/exporters.md + - nb/api_access_via_python.ipynb - How-to guides: - howto-guides/author-changesheets.md - howto-guides/create-triggers.md From 6bb203e030855b8b3cd0e32dc0df4981fb6f16ea Mon Sep 17 00:00:00 2001 From: eecavanna Date: Mon, 20 Jan 2025 11:39:28 -0800 Subject: [PATCH 67/70] Add comment regarding making redirects in place of documentation pages --- .github/workflows/deploy-redirects.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/deploy-redirects.yml b/.github/workflows/deploy-redirects.yml index 4e48aebc..f1bc82c9 100644 --- a/.github/workflows/deploy-redirects.yml +++ b/.github/workflows/deploy-redirects.yml @@ -39,6 +39,15 @@ jobs: _build/html/nb # Create HTML files containing redirects. 
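
(An aside on the `make_redirect` helper used throughout these workflow patches: its comment says it prints "the HTML markup for a web page that redirects the client," but only the plain text "Redirecting to ${url}..." appears in the body shown earlier. A page of that kind is conventionally a tiny HTML document with a `meta refresh` tag. The sketch below renders that conventional shape in Python; the exact markup the workflow emits is an assumption, not taken from these patches.)

    from pathlib import Path

    def make_redirect(rel_path: str) -> str:
        """Return minimal HTML that sends the browser to the page's new home."""
        url = f"https://docs.microbiomedata.org/runtime/{rel_path}"
        return (
            "<!DOCTYPE html><html><head>"
            f'<meta http-equiv="refresh" content="0; url={url}">'  # immediate client-side redirect
            "</head><body>"
            f'Redirecting to <a href="{url}">{url}</a>...'  # visible fallback link
            "</body></html>"
        )

    out_dir = Path("_build/html")
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "index.html").write_text(make_redirect("index.html"))
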
+
+      # Note: These HTML files will be accessible at the same URLs at which _original_ Runtime documentation pages
+      # had been accessible (on the https://microbiomedata.github.io/nmdc-runtime/ website) prior to the
+      # launch of the new, NMDC-wide documentation website (i.e., https://docs.microbiomedata.org).
+      #
+      # For documents added to the Runtime repo's `docs/` directory _after_ the launch of the latter website,
+      # creating redirects for those documents' URLs is unnecessary, since there was nothing at those URLs
+      # before (presumably, nobody expects anything other than an "HTTP 404 Not Found" error at them).
+      #
       cd _build/html
       make_redirect index.html > index.html

From c7ee7327a0e37c1adffc1690d410202270d6f04b Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Mon, 20 Jan 2025 11:47:01 -0800
Subject: [PATCH 68/70] Fix typos and clear notebook outputs

---
 docs/nb/api_access_via_python.ipynb | 205 ++++++++++------------------
 1 file changed, 75 insertions(+), 130 deletions(-)

diff --git a/docs/nb/api_access_via_python.ipynb b/docs/nb/api_access_via_python.ipynb
index 77c774aa..e85f7db6 100644
--- a/docs/nb/api_access_via_python.ipynb
+++ b/docs/nb/api_access_via_python.ipynb
@@ -34,18 +34,13 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "id": "49fe57573c851cba",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-01-17T06:36:26.872098Z",
- "start_time": "2025-01-17T06:36:25.703022Z"
- }
- },
- "outputs": [],
+ "metadata": {},
 "source": [
 "%pip install requests"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
 },
 {
 "cell_type": "code",
- "execution_count": 55,
 "id": "62d28232a8221431",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-01-17T06:46:09.787642Z",
- "start_time": "2025-01-17T06:46:09.613677Z"
- }
- },
- "outputs": [],
+ "metadata": {},
 "source": [
 "import requests,\n",
 "\n",
 "# Submit an HTTP GET request to an example HTTP server.\n",
 "response = requests.get(\"https://jsonplaceholder.typicode.com/posts/1\")"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
 },
 {
 "cell_type": "code",
- "execution_count": 7,
 "id": "ae560ed292755cd2",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-01-17T06:46:11.147029Z",
- "start_time": "2025-01-17T06:46:11.143317Z"
- }
- },
- "outputs": [],
+ "metadata": {},
 "source": [
 "# Get the HTTP status code from the response.\n",
 "response.status_code"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
 },
 {
 "cell_type": "code",
- "execution_count": 8,
 "id": "870810045483f31f",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-01-17T06:46:12.367741Z",
- "start_time": "2025-01-17T06:46:12.361007Z"
- }
- },
- "outputs": [],
+ "metadata": {},
 "source": [
 "# Parse the response as a JSON string.\n",
 "response.json()"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
 },
 {
 "cell_type": "code",
- "execution_count": 9,
 "id": "8cc19fe2047322a8",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-01-17T06:57:22.368322Z",
- "start_time": "2025-01-17T06:57:22.048361Z"
- }
- },
- "outputs": [],
+ "metadata": {},
 "source": [
 "response = requests.get(\"https://api.microbiomedata.org/studies\")"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
 },
 {
 "cell_type": "code",
- "execution_count": 11,
 "id": "e9bf89ac847d5383",
- "metadata": {
- "ExecuteTime": {
- "end_time": 
"2025-01-17T06:58:31.537245Z", - "start_time": "2025-01-17T06:58:31.449579Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "response.json()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -197,18 +167,13 @@ }, { "cell_type": "code", - "execution_count": 16, "id": "cdc717a66bfc3136", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-17T07:05:21.982850Z", - "start_time": "2025-01-17T07:05:21.972839Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "response.json().keys()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -219,25 +184,20 @@ "\n", "The `results` item contains the requested data—in this case, a list of studies.\n", "\n", - "You can ignore the `group_by` item. According to the NMDC Runtime API's API documentation, `group_by` is not implemented yet.\n", + "You can ignore the `group_by` item. According to the NMDC Runtime's API documentation, `group_by` is not implemented yet.\n", "\n", "Let's examine the `meta` item:" ] }, { "cell_type": "code", - "execution_count": 17, "id": "4f30bc5dee894252", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-17T07:11:23.271756Z", - "start_time": "2025-01-17T07:11:23.249253Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "response.json()[\"meta\"]" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -253,18 +213,13 @@ }, { "cell_type": "code", - "execution_count": 18, "id": "c5d6b38c5888050a", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-17T07:13:27.936952Z", - "start_time": "2025-01-17T07:13:27.930606Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "len(response.json()[\"results\"])" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -278,22 +233,17 @@ }, { "cell_type": "code", - "execution_count": 21, "id": "176887bd5a7c241e", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-17T07:17:03.432263Z", - "start_time": "2025-01-17T07:17:02.593617Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "# Resend the same HTTP request, but include a higher page size than the default of 25.\n", "response = requests.get(\"https://api.microbiomedata.org/studies?per_page=100\")\n", "\n", "# Count the studies in the `results` list.\n", "len(response.json()[\"results\"])" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -309,15 +259,8 @@ }, { "cell_type": "code", - "execution_count": 29, "id": "e012062c4f4d454d", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-17T07:26:20.476666Z", - "start_time": "2025-01-17T07:26:20.373037Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "response = requests.get(\"https://api.microbiomedata.org/studies?filter=ecosystem_category:Aquatic&per_page=2&sort_by=name\")\n", "\n", @@ -327,7 +270,9 @@ "# Print their names in the order in which they appear in the response.\n", "for study in response.json()[\"results\"]:\n", " print(study[\"name\"])" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -358,17 +303,15 @@ "id": "23fd4d59", "metadata": {}, "source": [ - "The first step is to tell this notebook what your NMDC Runtime username and password are. You can do that by running the cell below, which will propmt you for input:\n", + "The first step is to tell this notebook what your NMDC Runtime username and password are. 
You can do that by running the cell below, which will prompt you for input:\n", "\n", "> ⚠️ **Warning:** Storing real usernames and passwords directly in a Python notebook—or in any other form of source code—increases the risk that they be accidentally committed to a source code repository. That's why I'm using Python's [getpass](https://docs.python.org/3/library/getpass.html) module here, instead of suggesting that you type your username and password directly into the cell." ] }, { "cell_type": "code", - "execution_count": null, "id": "66b7b529", "metadata": {}, - "outputs": [], "source": [ "from getpass import getpass\n", "\n", @@ -379,7 +322,9 @@ "# Display string lengths as a \"sanity test.\"\n", "print(f\"Username length: {len(username)}\")\n", "print(f\"Password length: {len(password)}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -391,10 +336,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "827abd1c", "metadata": {}, - "outputs": [], "source": [ "response = requests.post(\n", " \"https://api.microbiomedata.org/token\",\n", @@ -407,7 +350,9 @@ "\n", "# Print the response payload, which includes the access token.\n", "response.json()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -423,15 +368,15 @@ }, { "cell_type": "code", - "execution_count": null, "id": "b7b81266", "metadata": {}, - "outputs": [], "source": [ "access_token = response.json()[\"access_token\"]\n", "\n", "print(f\"Access token: {access_token}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -440,19 +385,17 @@ "source": [ "Now that you have an access token, you can use it to access a \"private\" API endpoint.\n", "\n", - "One of the \"privacy\" API endpoints I like to run is called `/queries:run`. I like it because I can use it to query the database in more sophisticated ways than some of the public API endpoints allow.\n", + "One of the \"private\" API endpoints I like to access is called `/queries:run`. I use it to query the NMDC database in more sophisticated ways than some of the \"public\" API endpoints allow.\n", "\n", - "> 💡 **Tip:** As with _all_ API endpoints, you can learn about this one by reading the NMDC Runtime API's [API documentation](https://api.microbiomedata.org/docs).\n", + "> 💡 **Tip:** As with _all_ API endpoints, you can learn about this one by reading the NMDC Runtime's [API documentation](https://api.microbiomedata.org/docs).\n", "\n", "Let's use the \"private\" `/queries:run` API endpoint to find all the studies whose `ecosystem_category` value is `Aquatic` (just like we did with the \"public\" `/studies` API endpoint earlier)." 
] }, { "cell_type": "code", - "execution_count": null, "id": "1f8e5cd6", "metadata": {}, - "outputs": [], "source": [ "response = requests.post(\n", " \"https://api.microbiomedata.org/queries:run\",\n", @@ -466,7 +409,9 @@ ")\n", "\n", "response.json()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -478,13 +423,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "7b2c1a61", "metadata": {}, - "outputs": [], "source": [ "response.json().keys()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -496,13 +441,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "e9a1f073", "metadata": {}, - "outputs": [], "source": [ "response.json()[\"cursor\"].keys()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -514,13 +459,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "9afbae98", "metadata": {}, - "outputs": [], "source": [ "len(response.json()[\"cursor\"][\"firstBatch\"])" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -532,14 +477,14 @@ }, { "cell_type": "code", - "execution_count": null, "id": "5d0ecffc", "metadata": {}, - "outputs": [], "source": [ "for study in response.json()[\"cursor\"][\"firstBatch\"]:\n", " print(study[\"name\"])" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -559,10 +504,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "0c5446ea", "metadata": {}, - "outputs": [], "source": [ "response = requests.post(\n", " \"https://api.microbiomedata.org/queries:run\",\n", @@ -573,7 +516,9 @@ ")\n", "\n", "response.json()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", From 26535f8eaa64ab36b50abfdab57198484b957912 Mon Sep 17 00:00:00 2001 From: eecavanna <134325062+eecavanna@users.noreply.github.com> Date: Mon, 20 Jan 2025 12:07:19 -0800 Subject: [PATCH 69/70] Clarify example sentence in PR template --- .github/pull_request_template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index b0c8634b..792154f3 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,12 +4,12 @@ │ Summarize the changes you made on this branch. This is typically a more │ │ detailed restatement of the PR title. │ │ │ - │ Example: "In this branch, I updated the `/studies/{study_id}` endpoint │ + │ Example: "On this branch, I updated the `/studies/{study_id}` endpoint │ │ so it returns an HTTP 404 response when the specified study │ │ does not exist." │ └─────────────────────────────────────────────────────────────────────────┘--> -In this branch, I... +On this branch, I... ### Details From ab7d9d671b589ddcc6418fe97b74deb2feb4e6c7 Mon Sep 17 00:00:00 2001 From: eecavanna <134325062+eecavanna@users.noreply.github.com> Date: Mon, 20 Jan 2025 12:25:05 -0800 Subject: [PATCH 70/70] Clarify explanation of access token in Python notebook (docs) --- docs/nb/api_access_via_python.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/nb/api_access_via_python.ipynb b/docs/nb/api_access_via_python.ipynb index e85f7db6..0b529ca9 100644 --- a/docs/nb/api_access_via_python.ipynb +++ b/docs/nb/api_access_via_python.ipynb @@ -359,9 +359,7 @@ "id": "ddeba883", "metadata": {}, "source": [ - "The API response will contain several properties (you can list them via `response.json().keys()`). 
One of them is named `access_token`.\n", - "\n", - "The `access_token` property contains a string you can use to access \"private\" API endpoints. That string is the access token (hence, the name of the property).\n", + "The API response will contain several properties (you can list them via `response.json().keys()`). One of them is named `access_token`. Its value is an access token; i.e., a string you can use to access \"private\" API endpoints.\n", "\n", "I recommend storing that access token in a Python variable for future reference. You can do that by running this cell:" ]
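
(To close the loop on this patch series: patch 56 above documented the two-step behavior of the `/metadata/json:validate` endpoint, and the notebook patches demonstrated `requests`-based access to the Runtime API. The sketch below combines the two. The payload mirrors the study document used in the test earlier in the series; the endpoint's exact success payload is not shown in these patches, so the expectations in the comments are assumptions.)

    import requests

    # A minimal "nmdc:Database" payload, modeled on the study document used in the tests above.
    payload = {
        "study_set": [
            {
                "id": "nmdc:sty-00-000001",
                "type": "nmdc:Study",
                "study_category": "research_study",
            }
        ]
    }

    # Step 1 checks each document's format; step 2 is the referential integrity check,
    # which also considers documents that travel together in this same payload.
    response = requests.post(
        "https://api.microbiomedata.org/metadata/json:validate",
        json=payload,
    )
    print(response.status_code)  # expected to be 200 when the service is reachable
    print(response.json())       # a result object describing any validation findings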