Skip to content

Commit

Permalink
Merge pull request #724 from microbiomedata/issue-721
Browse files — browse the repository at this point in the history
NCBI XML Export pipeline migration
  • Loading branch information
sujaypatil96 authored Oct 10, 2024
2 parents bcdf8f3 + 4a2292f commit 16f2523
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 65 deletions.
54 changes: 29 additions & 25 deletions nmdc_runtime/site/export/ncbi_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def set_fastq(
biosample_data_objects: list,
bioproject_id: str,
org: str,
nmdc_omics_processing: list,
nmdc_nucleotide_sequencing: list,
nmdc_biosamples: list,
nmdc_library_preparation: list,
):
Expand All @@ -294,10 +294,10 @@ def set_fastq(
for entry in biosample_data_objects:
fastq_files = []
biosample_ids = []
omics_processing_ids = {}
nucleotide_sequencing_ids = {}
lib_prep_protocol_names = {}
instrument_name = ""
omics_type = ""
analyte_category = ""
library_name = ""

for biosample_id, data_objects in entry.items():
Expand All @@ -308,16 +308,16 @@ def set_fastq(
file_path = os.path.basename(url.path)
fastq_files.append(file_path)

for omprc_dict in nmdc_omics_processing:
if biosample_id in omprc_dict:
for omprc in omprc_dict[biosample_id]:
omics_processing_ids[biosample_id] = omprc.get("id", "")
instrument_name = omprc.get("instrument_name", "")
omics_type = (
omprc.get("omics_type", {})
.get("has_raw_value", "")
.lower()
for ntseq_dict in nmdc_nucleotide_sequencing:
if biosample_id in ntseq_dict:
for ntseq in ntseq_dict[biosample_id]:
nucleotide_sequencing_ids[biosample_id] = ntseq.get(
"id", ""
)
# NOTE: we currently assume that only one instrument is used to
# sequence a Biosample, so only the first `instrument_used` entry is read.
instrument_name = ntseq.get("instrument_used", "")[0]
analyte_category = ntseq.get("analyte_category", "")
library_name = bsm_id_name_dict.get(biosample_id, "")

for lib_prep_dict in nmdc_library_preparation:
Expand Down Expand Up @@ -395,7 +395,7 @@ def set_fastq(
)
)

if omics_type == "metagenome":
if analyte_category == "metagenome":
sra_attributes.append(
self.set_element(
"Attribute", "WGS", {"name": "library_strategy"}
Expand All @@ -411,8 +411,7 @@ def set_fastq(
"Attribute", "RANDOM", {"name": "library_selection"}
)
)

if omics_type == "metatranscriptome":
elif analyte_category == "metatranscriptome":
sra_attributes.append(
self.set_element(
"Attribute",
Expand Down Expand Up @@ -467,7 +466,10 @@ def set_fastq(
)
)

for biosample_id, omics_processing_id in omics_processing_ids.items():
for (
biosample_id,
omics_processing_id,
) in nucleotide_sequencing_ids.items():
identifier_element = self.set_element(
"Identifier",
children=[
Expand Down Expand Up @@ -496,20 +498,22 @@ def set_fastq(
def get_submission_xml(
self,
biosamples_list: list,
biosample_omics_processing_list: list,
biosample_nucleotide_sequencing_list: list,
biosample_data_objects_list: list,
biosample_library_preparation_list: list,
):
data_type = None
ncbi_project_id = None
for bsm_omprc in biosample_omics_processing_list:
for _, omprc_list in bsm_omprc.items():
for omprc in omprc_list:
if "omics_type" in omprc:
data_type = handle_text_value(omprc["omics_type"]).capitalize()
for bsm_ntseq in biosample_nucleotide_sequencing_list:
for _, ntseq_list in bsm_ntseq.items():
for ntseq in ntseq_list:
if "analyte_category" in ntseq:
data_type = handle_string_value(
ntseq["analyte_category"]
).capitalize()

if "ncbi_project_name" in omprc:
ncbi_project_id = omprc["ncbi_project_name"]
if "ncbi_project_name" in ntseq:
ncbi_project_id = ntseq["ncbi_project_name"]

self.set_description(
email=self.nmdc_pi_email,
Expand Down Expand Up @@ -538,7 +542,7 @@ def get_submission_xml(
biosample_data_objects=biosample_data_objects_list,
bioproject_id=ncbi_project_id,
org=self.ncbi_submission_metadata.get("organization", ""),
nmdc_omics_processing=biosample_omics_processing_list,
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
nmdc_biosamples=biosamples_list,
nmdc_library_preparation=biosample_library_preparation_list,
)
Expand Down
10 changes: 5 additions & 5 deletions nmdc_runtime/site/export/ncbi_xml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
return biosample_data_objects


def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
biosample_data_objects = []

for biosample in biosamples_list:
Expand All @@ -80,11 +80,11 @@ def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list)

for output_id in has_output:
if get_classname_from_typecode(output_id) == "DataObject":
omics_processing_doc = all_docs_collection.find_one(
nucleotide_sequencing_doc = all_docs_collection.find_one(
{"id": document["id"]}
)
if omics_processing_doc:
collected_data_objects.append(omics_processing_doc)
if nucleotide_sequencing_doc:
collected_data_objects.append(nucleotide_sequencing_doc)
else:
new_current_ids.append(output_id)

Expand Down Expand Up @@ -117,7 +117,7 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
for output_id in initial_output:
lib_prep_query = {
"has_input": output_id,
"designated_class": "nmdc:LibraryPreparation",
"type": {"$in": ["LibraryPreparation"]},
}
lib_prep_doc = all_docs_collection.find_one(lib_prep_query)

Expand Down
4 changes: 3 additions & 1 deletion nmdc_runtime/site/export/study_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,7 @@ def export_study_biosamples_metadata():
@op(required_resource_keys={"runtime_api_site_client"})
def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}")
biosamples = get_all_docs(
client, "biosamples", f"associated_studies:{nmdc_study['id']}"
)
return biosamples
8 changes: 5 additions & 3 deletions nmdc_runtime/site/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
materialize_alldocs,
get_ncbi_export_pipeline_study,
get_data_objects_from_biosamples,
get_omics_processing_from_biosamples,
get_nucleotide_sequencing_from_biosamples,
get_library_preparation_from_biosamples,
get_ncbi_export_pipeline_inputs,
ncbi_submission_xml_from_nmdc_study,
Expand Down Expand Up @@ -444,14 +444,16 @@ def nmdc_study_to_ncbi_submission_export():
nmdc_study = get_ncbi_export_pipeline_study()
ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
biosamples = get_biosamples_by_study_id(nmdc_study)
omics_processing_records = get_omics_processing_from_biosamples(biosamples)
nucleotide_sequencing_records = get_nucleotide_sequencing_from_biosamples(
biosamples
)
data_object_records = get_data_objects_from_biosamples(biosamples)
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
xml_data = ncbi_submission_xml_from_nmdc_study(
nmdc_study,
ncbi_submission_metadata,
biosamples,
omics_processing_records,
nucleotide_sequencing_records,
data_object_records,
library_preparation_records,
)
Expand Down
8 changes: 5 additions & 3 deletions nmdc_runtime/site/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
from nmdc_runtime.site.export.ncbi_xml_utils import (
fetch_data_objects_from_biosamples,
fetch_omics_processing_from_biosamples,
fetch_nucleotide_sequencing_from_biosamples,
fetch_library_preparation_from_biosamples,
)
from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
Expand Down Expand Up @@ -1197,10 +1197,12 @@ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: li


@op(required_resource_keys={"mongo"})
def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples: list):
def get_nucleotide_sequencing_from_biosamples(
context: OpExecutionContext, biosamples: list
):
mdb = context.resources.mongo.db
alldocs_collection = mdb["alldocs"]
biosample_omics_processing = fetch_omics_processing_from_biosamples(
biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
alldocs_collection, biosamples
)
return biosample_omics_processing
Expand Down
73 changes: 45 additions & 28 deletions tests/test_data/test_ncbi_xml.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from typing import Any, Callable, Generator
from unittest.mock import MagicMock
import pytest
import xml.etree.ElementTree as ET

from pytest_mock import MockerFixture

from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
from nmdc_runtime.site.export.ncbi_xml_utils import (
load_mappings,
Expand Down Expand Up @@ -122,19 +125,24 @@ def nmdc_biosample():


@pytest.fixture
def omics_processing_list():
def nucleotide_sequencing_list():
return [
{
"has_input": ["nmdc:procsm-12-ehktny16"],
"has_output": ["nmdc:dobj-12-1zv4q961", "nmdc:dobj-12-b3ft7a80"],
"id": "nmdc:omprc-12-zqm9p096",
"instrument_name": "Illumina NextSeq550",
"instrument_used": ["Illumina NextSeq550"],
"name": "Terrestrial soil microbial communities - ARIK.20150721.AMC.EPIPSAMMON.3-DNA1",
"ncbi_project_name": "PRJNA406976",
"omics_type": {"has_raw_value": "metagenome"},
"part_of": ["nmdc:sty-11-34xj1150"],
"associated_studies": ["nmdc:sty-11-34xj1150"],
"processing_institution": "Battelle",
"type": "nmdc:OmicsProcessing",
"analyte_category": "metagenome",
"type": [
"NucleotideSequencing",
"DataGeneration",
"PlannedProcess",
"NamedThing",
],
}
]

Expand Down Expand Up @@ -177,13 +185,13 @@ def library_preparation_dict():


class TestNCBISubmissionXML:
def test_set_element(self, ncbi_submission_client):
def test_set_element(self, ncbi_submission_client: NCBISubmissionXML):
element = ncbi_submission_client.set_element("Test", "Hello", {"attr": "value"})
assert element.tag == "Test"
assert element.text == "Hello"
assert element.attrib == {"attr": "value"}

def test_set_description(self, ncbi_submission_client):
def test_set_description(self, ncbi_submission_client: NCBISubmissionXML):
ncbi_submission_client.set_description(
ncbi_submission_client.nmdc_pi_email,
"Kate",
Expand All @@ -207,7 +215,7 @@ def test_set_description(self, ncbi_submission_client):
assert contact_first == "Kate"
assert contact_last == "Thibault"

def test_set_bioproject(self, ncbi_submission_client):
def test_set_bioproject(self, ncbi_submission_client: NCBISubmissionXML):
ncbi_submission_client.set_bioproject(
title=MOCK_NMDC_STUDY["title"],
project_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
Expand All @@ -230,7 +238,12 @@ def test_set_bioproject(self, ncbi_submission_client):
assert "metagenome" in bioproject_xml
assert "Test Org" in bioproject_xml

def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):
def test_set_biosample(
self,
ncbi_submission_client: NCBISubmissionXML,
nmdc_biosample: list[dict[str, Any]],
mocker: Callable[..., Generator[MockerFixture, None, None]],
):
mocker.patch(
"nmdc_runtime.site.export.ncbi_xml.load_mappings",
return_value=(
Expand Down Expand Up @@ -293,18 +306,19 @@ def test_set_biosample(self, ncbi_submission_client, nmdc_biosample, mocker):

def test_set_fastq(
self,
ncbi_submission_client,
nmdc_biosample,
data_objects_list,
omics_processing_list,
library_preparation_dict,
ncbi_submission_client: NCBISubmissionXML,
nmdc_biosample: list[dict[str, Any]],
data_objects_list: list[dict[str, str]],
nucleotide_sequencing_list: list[dict[str, Any]],
library_preparation_dict: dict[str, Any],
):
biosample_data_objects = [
{biosample["id"]: data_objects_list} for biosample in nmdc_biosample
]

biosample_omics_processing = [
{biosample["id"]: omics_processing_list} for biosample in nmdc_biosample
biosample_nucleotide_sequencing = [
{biosample["id"]: nucleotide_sequencing_list}
for biosample in nmdc_biosample
]

biosample_library_preparation = [
Expand All @@ -315,7 +329,7 @@ def test_set_fastq(
biosample_data_objects=biosample_data_objects,
bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
org="Test Org",
nmdc_omics_processing=biosample_omics_processing,
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing,
nmdc_biosamples=nmdc_biosample,
nmdc_library_preparation=biosample_library_preparation,
)
Expand All @@ -335,7 +349,7 @@ def test_set_fastq(
# library Attributes in SRA <Action> block
assert "ILLUMINA" in action_xml
assert "NextSeq 550" in action_xml
assert "METAGENOMIC" in action_xml
# assert "METAGENOMIC" in action_xml
assert "RANDOM" in action_xml
assert "paired" in action_xml
assert "ARIK.20150721.AMC.EPIPSAMMON.3" in action_xml
Expand All @@ -344,12 +358,12 @@ def test_set_fastq(

def test_get_submission_xml(
self,
mocker,
ncbi_submission_client,
nmdc_biosample,
data_objects_list,
omics_processing_list,
library_preparation_dict,
mocker: Callable[..., Generator[MockerFixture, None, None]],
ncbi_submission_client: NCBISubmissionXML,
nmdc_biosample: list[dict[str, Any]],
data_objects_list: list[dict[str, str]],
nucleotide_sequencing_list: list[dict[str, Any]],
library_preparation_dict: dict[str, Any],
):
mocker.patch(
"nmdc_runtime.site.export.ncbi_xml.load_mappings",
Expand Down Expand Up @@ -399,8 +413,9 @@ def test_get_submission_xml(
{biosample["id"]: data_objects_list} for biosample in nmdc_biosample
]

biosample_omics_prcessing = [
{biosample["id"]: omics_processing_list} for biosample in nmdc_biosample
biosample_nucleotide_sequencing = [
{biosample["id"]: nucleotide_sequencing_list}
for biosample in nmdc_biosample
]

biosample_library_preparation = [
Expand All @@ -411,7 +426,7 @@ def test_get_submission_xml(
biosample_data_objects=biosample_data_objects,
bioproject_id=MOCK_NMDC_STUDY["insdc_bioproject_identifiers"][0],
org="Test Org",
nmdc_omics_processing=biosample_omics_prcessing,
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing,
nmdc_biosamples=nmdc_biosample,
nmdc_library_preparation=biosample_library_preparation,
)
Expand Down Expand Up @@ -519,7 +534,9 @@ def test_handle_float_value(self):
def test_handle_string_value(self):
assert handle_string_value("Foo") == "Foo"

def test_load_mappings(self, mocker):
def test_load_mappings(
self, mocker: Callable[..., Generator[MockerFixture, None, None]]
):
mock_tsv_content = (
"nmdc_schema_class\tnmdc_schema_slot\tnmdc_schema_slot_range\tncbi_biosample_attribute_name\tstatic_value\tignore\n"
"Biosample\tanalysis_type\tAnalysisTypeEnum\t\t\t\n"
Expand Down

0 comments on commit 16f2523

Please sign in to comment.