Skip to content

Commit

Permalink
Merge pull request #738 from microbiomedata/issue-723
Browse files Browse the repository at this point in the history
`/data_objects/study/{study_id}` should take into account `was_informed_by` relationships
  • Loading branch information
sujaypatil96 authored Oct 28, 2024
2 parents e1af4cf + f992e2b commit 9ec7abb
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 25 deletions.
100 changes: 75 additions & 25 deletions nmdc_runtime/api/endpoints/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from starlette.responses import HTMLResponse
from toolz import merge, assoc_in

from nmdc_schema.get_nmdc_view import ViewGetter
from nmdc_runtime.api.core.util import raise404_if_none
from nmdc_runtime.api.db.mongo import (
get_mongo_db,
Expand Down Expand Up @@ -160,42 +161,91 @@ def find_data_objects_for_study(
biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
biosample_ids = [biosample["id"] for biosample in biosamples]

# SchemaView interface to NMDC Schema
nmdc_view = ViewGetter()
nmdc_sv = nmdc_view.get_view()
dg_descendants = nmdc_sv.class_descendants("DataGeneration")

def collect_data_objects(doc_ids, collected_objects, unique_ids):
"""Helper function to collect data objects from `has_input` and `has_output` references."""
for doc_id in doc_ids:
if (
get_classname_from_typecode(doc_id) == "DataObject"
and doc_id not in unique_ids
):
data_obj = mdb.data_object_set.find_one({"id": doc_id})
if data_obj:
collected_objects.append(strip_oid(data_obj))
unique_ids.add(doc_id)

# Another way in which DataObjects can be related to Biosamples is through the
# `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
# collection that are "informed" by the same DataGeneration records that created
# the outputs above. Then we need to get additional DataObject records that are
# created by this linkage.
def process_informed_by_docs(doc, collected_objects, unique_ids):
"""Process documents linked by `was_informed_by` and collect relevant data objects."""
informed_by_docs = mdb.workflow_execution_set.find(
{"was_informed_by": doc["id"]}
)
for informed_doc in informed_by_docs:
collect_data_objects(
informed_doc.get("has_input", []), collected_objects, unique_ids
)
collect_data_objects(
informed_doc.get("has_output", []), collected_objects, unique_ids
)

biosample_data_objects = []

for biosample_id in biosample_ids:
current_ids = [biosample_id]
collected_data_objects = []

unique_ids = set()

# Iterate over records in the `alldocs` collection. Look for
# records that have the given biosample_id as value on the
# `has_input` key/slot. The retrieved documents might also have a
# `has_output` key/slot associated with them. Get the value of the
# `has_output` key and check if it's type is `nmdc:DataObject`. If
# it's not, repeat the process till it is.
while current_ids:
new_current_ids = []
for current_id in current_ids:
query = {"has_input": current_id}
document = mdb.alldocs.find_one(query)

if not document:
continue

has_output = document.get("has_output")
if not has_output:
continue

for output_id in has_output:
if get_classname_from_typecode(output_id) == "DataObject":
data_object_doc = mdb.data_object_set.find_one(
{"id": output_id}
# Query to find all documents with current_id as the value on
# `has_input` slot
for doc in mdb.alldocs.find({"has_input": current_id}):
has_output = doc.get("has_output", [])

# Process `DataGeneration` type documents linked by `was_informed_by`
if not has_output and any(
t in dg_descendants for t in doc.get("type", [])
):
process_informed_by_docs(
doc, collected_data_objects, unique_ids
)
continue

collect_data_objects(has_output, collected_data_objects, unique_ids)
new_current_ids.extend(
op
for op in has_output
if get_classname_from_typecode(op) != "DataObject"
)

if any(t in dg_descendants for t in doc.get("type", [])):
process_informed_by_docs(
doc, collected_data_objects, unique_ids
)
if data_object_doc:
collected_data_objects.append(strip_oid(data_object_doc))
else:
new_current_ids.append(output_id)

current_ids = new_current_ids

if collected_data_objects:
biosample_data_objects.append(
{
"biosample_id": biosample_id,
"data_object_set": collected_data_objects,
}
)
result = {
"biosample_id": biosample_id,
"data_objects": collected_data_objects,
}
biosample_data_objects.append(result)

return biosample_data_objects

Expand Down
1 change: 1 addition & 0 deletions nmdc_runtime/site/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,7 @@ def materialize_alldocs(context) -> int:
# /data_objects/study/{study_id} endpoint
mdb.alldocs.create_index("has_input")
mdb.alldocs.create_index("has_output")
mdb.alldocs.create_index("was_informed_by")
context.log.info(
f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
)
Expand Down

0 comments on commit 9ec7abb

Please sign in to comment.