Skip to content

Commit

Permalink
Merge pull request #750 from microbiomedata/749-migrations-update-mig…
Browse files Browse the repository at this point in the history
…ration-notebook-so-database-name-is-configurable-not-hard-coded-to-nmdc

Migrations: Update notebook to perform LinkML validation and allow database name configuration
  • Loading branch information
eecavanna authored Nov 2, 2024
2 parents 14d8ded + 4844c4b commit 6463463
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 72 deletions.
2 changes: 2 additions & 0 deletions demo/metadata_migration/notebooks/.notebook.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@ ORIGIN_MONGO_HOST="__REPLACE_ME__"
ORIGIN_MONGO_PORT="__REPLACE_ME__"
ORIGIN_MONGO_USERNAME="__REPLACE_ME__"
ORIGIN_MONGO_PASSWORD="__REPLACE_ME__"
ORIGIN_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc"

# Connection parameters for the Transformer Mongo server (typically a local server).
TRANSFORMER_MONGO_HOST="__REPLACE_ME__"
TRANSFORMER_MONGO_PORT="__REPLACE_ME__"
TRANSFORMER_MONGO_USERNAME="__REPLACE_ME__"
TRANSFORMER_MONGO_PASSWORD="__REPLACE_ME__"
TRANSFORMER_MONGO_DATABASE_NAME="__REPLACE_ME__" # e.g. "nmdc_transformer"
65 changes: 65 additions & 0 deletions demo/metadata_migration/notebooks/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, Optional, List
import logging
from datetime import datetime
from functools import cache

from dotenv import dotenv_values
from linkml_runtime import SchemaView
Expand Down Expand Up @@ -78,11 +79,27 @@ def parse_and_validate_notebook_config_file(
origin_mongo_port = notebook_config["ORIGIN_MONGO_PORT"]
origin_mongo_username = notebook_config["ORIGIN_MONGO_USERNAME"]
origin_mongo_password = notebook_config["ORIGIN_MONGO_PASSWORD"]
origin_mongo_database_name = notebook_config["ORIGIN_MONGO_DATABASE_NAME"]

transformer_mongo_host = notebook_config["TRANSFORMER_MONGO_HOST"]
transformer_mongo_port = notebook_config["TRANSFORMER_MONGO_PORT"]
transformer_mongo_username = notebook_config["TRANSFORMER_MONGO_USERNAME"]
transformer_mongo_password = notebook_config["TRANSFORMER_MONGO_PASSWORD"]
transformer_mongo_database_name = notebook_config["TRANSFORMER_MONGO_DATABASE_NAME"]

# Validate the database names.
if origin_mongo_database_name.strip() == "":
raise ValueError(f"Origin database name cannot be empty")
if transformer_mongo_database_name.strip() == "":
raise ValueError(f"Transformer database name cannot be empty")
if all([
origin_mongo_host == transformer_mongo_host,
origin_mongo_port == transformer_mongo_port,
origin_mongo_database_name == transformer_mongo_database_name,
]):
# Note: We don't allow the use of the origin database as the transformer,
# because that would prevent us from easily aborting the migration.
raise ValueError(f"The origin and transformer cannot both be the same database")

return dict(
origin_dump_folder_path=origin_dump_folder_path,
Expand All @@ -94,10 +111,12 @@ def parse_and_validate_notebook_config_file(
origin_mongo_port=origin_mongo_port,
origin_mongo_username=origin_mongo_username,
origin_mongo_password=origin_mongo_password,
origin_mongo_database_name=origin_mongo_database_name,
transformer_mongo_host=transformer_mongo_host,
transformer_mongo_port=transformer_mongo_port,
transformer_mongo_username=transformer_mongo_username,
transformer_mongo_password=transformer_mongo_password,
transformer_mongo_database_name=transformer_mongo_database_name,
)

def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None:
Expand All @@ -114,10 +133,12 @@ def __init__(self, notebook_config_file_path: str = "./.notebook.env") -> None:
self.origin_mongo_port = notebook_config["origin_mongo_port"]
self.origin_mongo_username = notebook_config["origin_mongo_username"]
self.origin_mongo_password = notebook_config["origin_mongo_password"]
self.origin_mongo_database_name = notebook_config["origin_mongo_database_name"]
self.transformer_mongo_host = notebook_config["transformer_mongo_host"]
self.transformer_mongo_port = notebook_config["transformer_mongo_port"]
self.transformer_mongo_username = notebook_config["transformer_mongo_username"]
self.transformer_mongo_password = notebook_config["transformer_mongo_password"]
self.transformer_mongo_database_name = notebook_config["transformer_mongo_database_name"]


def setup_logger(
Expand Down Expand Up @@ -154,6 +175,9 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]:
Returns the names of the slots of the `Database` class that describe database collections.
:param schema_view: A `SchemaView` instance
Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
with permission from its author.
"""
collection_names = []

Expand All @@ -170,3 +194,44 @@ def get_collection_names_from_schema(schema_view: SchemaView) -> List[str]:
collection_names = list(set(collection_names))

return collection_names


@cache  # results are memoized, so both arguments must be hashable
def translate_class_uri_into_schema_class_name(schema_view: SchemaView, class_uri: str) -> Optional[str]:
    r"""
    Looks up the schema class whose `class_uri` equals the specified value and returns that class's name.

    Example: "nmdc:Biosample" (a `class_uri` value) -> "Biosample" (a class name)

    :param schema_view: A `SchemaView` instance whose `all_classes()` result is searched
    :param class_uri: The `class_uri` value to look for
    :return: The `name` of the first class definition whose `class_uri` matches,
             or `None` if no class definition matches

    References:
    - https://linkml.io/linkml/developers/schemaview.html#linkml_runtime.utils.schemaview.SchemaView.all_classes
    - https://linkml.io/linkml/code/metamodel.html#linkml_runtime.linkml_model.meta.ClassDefinition.class_uri

    Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
            with permission from its author.
    """
    # Lazily scan the class definitions; `next` stops at the first match
    # and falls back to `None` when the generator is exhausted.
    names_of_matching_classes = (
        definition.name
        for definition in schema_view.all_classes().values()
        if definition.class_uri == class_uri
    )
    return next(names_of_matching_classes, None)


def derive_schema_class_name_from_document(schema_view: SchemaView, document: dict) -> Optional[str]:
    r"""
    Determines which schema class, if any, the specified document claims to be an instance of.

    The document's `type` field is treated as a `class_uri` value and translated into a schema class name.
    This relies on the assumption that the `type` field holds the `class_uri` of the schema class
    that the document represents an instance of. Slot definition for such a field:
    https://github.com/microbiomedata/berkeley-schema-fy24/blob/fc2d9600/src/schema/basic_slots.yaml#L420-L436

    :param schema_view: A `SchemaView` instance, passed through to the `class_uri` lookup
    :param document: The document (dictionary) to inspect
    :return: The schema class name, or `None` if the document has no string-valued `type` field
             or the lookup finds no class having that `class_uri`

    Source: This function was copied from https://github.com/microbiomedata/refscan/blob/main/refscan/lib/helpers.py
            with permission from its author.
    """
    # Guard clause: without a string-valued `type` field, there is nothing to translate.
    if "type" not in document or not isinstance(document["type"], str):
        return None

    return translate_class_uri_into_schema_class_name(schema_view, document["type"])
Loading

0 comments on commit 6463463

Please sign in to comment.