Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve testing of round trip serialization #480

Merged
merged 30 commits into from
Jan 13, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
831a58e
Convert SSSOM TSV => JSON => TSV and confirm the MappingSetDataFrame …
hrshdhgd Sep 26, 2023
1854d29
Add more explicit tests
cthoyt Sep 26, 2023
e00f4d4
Merge branch 'master' into issue-321
hrshdhgd Oct 3, 2023
d8db602
Merge branch 'master' into issue-321
hrshdhgd Dec 13, 2023
8f9a334
SSSOM tsv => json => tsv fixed.
hrshdhgd Dec 14, 2023
9702d8c
Now gets both metadata and prefix_map from JSON
hrshdhgd Dec 14, 2023
7ca9934
undo changes for util.py
hrshdhgd Dec 14, 2023
43e4555
undo changes for util.py
hrshdhgd Dec 14, 2023
b117b38
not needed
hrshdhgd Dec 14, 2023
f03b612
undo util change
hrshdhgd Dec 14, 2023
2eac05b
Refactor @context => JSON_CONTEXT_KEY
hrshdhgd Dec 14, 2023
4c480a5
changed @context back to what it was
hrshdhgd Dec 14, 2023
fc8c43c
Use all attr of MappingSet instead of the previous two
hrshdhgd Dec 14, 2023
f87db65
Update parsers.py
matentzn Jan 4, 2024
a5c6509
Update test_parsers.py
matentzn Jan 4, 2024
93bbe68
Add roundtrip test for RDF parsing as well
matentzn Jan 4, 2024
c7bb6c8
Update test_parsers.py
matentzn Jan 5, 2024
2081f47
Update tests
cthoyt Jan 5, 2024
1bf8756
Add TSV test and minor refactor
cthoyt Jan 5, 2024
f72ff1a
Remove redundant test
cthoyt Jan 5, 2024
70325d3
Update test_parsers.py
cthoyt Jan 5, 2024
5bcc9fb
Refactor the rdf parser to use linkml
matentzn Jan 12, 2024
655012b
Some linting
matentzn Jan 12, 2024
e21ab07
Ignore unmapped predicates
matentzn Jan 12, 2024
77dee25
Update parsers.py
cthoyt Jan 13, 2024
3a679a1
Fix bimap usage
cthoyt Jan 13, 2024
249ee1c
Add missing field to RDF
cthoyt Jan 13, 2024
52cb27f
Fix broken oio references
cthoyt Jan 13, 2024
de4f9c6
Update broken test data in rdf
matentzn Jan 13, 2024
556728e
Update wrong test count
matentzn Jan 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 66 additions & 17 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
CONFIDENCE,
CURIE_MAP,
DEFAULT_MAPPING_PROPERTIES,
LICENSE,
MAPPING_JUSTIFICATION,
MAPPING_JUSTIFICATION_UNSPECIFIED,
MAPPING_SET_ID,
OBJECT_ID,
OBJECT_LABEL,
OBJECT_SOURCE,
Expand Down Expand Up @@ -235,10 +237,27 @@ def parse_sssom_rdf(
) -> MappingSetDataFrame:
"""Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
raise_for_bad_path(file_path)
converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

g = Graph()
g.parse(file_path, format=serialisation)

# Initialize meta if it's None
if meta is None:
meta = {}

# The priority order for combining prefix maps are:
# 1. Built-in prefix map
# 2. Internal prefix map inside the document
# 3. Prefix map passed through this function inside the ``meta``
# 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
converter = curies.chain(
[
_get_built_in_prefix_map(),
Converter.from_rdflib(g),
Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
ensure_converter(prefix_map, use_defaults=False),
]
)
msdf = from_sssom_rdf(g, prefix_map=converter, meta=meta)
# df: pd.DataFrame = msdf.df
# if mapping_predicates and not df.empty():
Expand All @@ -247,22 +266,33 @@ def parse_sssom_rdf(


def parse_sssom_json(
    file_path: str, prefix_map: ConverterHint = None, meta: Optional[MetadataType] = None, **kwargs
) -> MappingSetDataFrame:
    """Parse a SSSOM JSON file to a :class:`MappingSetDataFrame`.

    :param file_path: Path to a SSSOM JSON (JSON-LD) document.
    :param prefix_map: An optional extra prefix map; lowest priority in the
        chain assembled below.
    :param meta: Optional external metadata. Its ``curie_map`` entry, if
        present, is removed and merged into the converter; the remaining
        entries are used to augment the mapping set's metadata.
    :param kwargs: Accepted for interface compatibility; currently unused.
    :return: A mapping set dataframe built from the document.
    """
    raise_for_bad_path(file_path)

    with open(file_path) as json_file:
        jsondoc = json.load(json_file)

    # Initialize meta if it's None so the ``meta.pop`` below is safe.
    if meta is None:
        meta = {}

    # The priority order for combining prefix maps is:
    # 1. Built-in prefix map
    # 2. Internal prefix map inside the document
    # 3. Prefix map passed through this function inside the ``meta``
    # 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_jsonld(file_path),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )

    msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=converter, meta=meta)
    return msdf


Expand Down Expand Up @@ -323,9 +353,7 @@ def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:

def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
    """Create a :class:`MappingSet` seeded from ``meta`` layered over defaults.

    :param meta: Optional metadata dictionary; entries here take precedence
        over the default metadata (ChainMap looks in ``meta`` first).
    :return: A mapping set with the required ``mapping_set_id`` and
        ``license`` fields populated and any further ``meta`` entries applied.
    """
    # ChainMap: values supplied in ``meta`` shadow the defaults for the two
    # required slots looked up just below.
    _metadata = dict(ChainMap(meta or {}, get_default_metadata()))
    mapping_set = MappingSet(mapping_set_id=_metadata[MAPPING_SET_ID], license=_metadata[LICENSE])
    # Only the caller-provided metadata (not the defaults) is copied onto the
    # remaining slots of the mapping set.
    _set_metadata_in_mapping_set(mapping_set=mapping_set, metadata=meta)
    return mapping_set

Expand Down Expand Up @@ -477,13 +505,28 @@ def from_sssom_json(

:param jsondoc: JSON document
:param prefix_map: Prefix map
:param meta: metadata
:param meta: metadata used to augment the metadata existing in the mapping set
:return: MappingSetDataFrame object
"""
converter = ensure_converter(prefix_map)

mapping_set = cast(MappingSet, JSONLoader().load(source=jsondoc, target_class=MappingSet))

_set_metadata_in_mapping_set(mapping_set, metadata=meta)
# The priority order for combining metadata is:
# 1. Metadata appearing in the SSSOM document
# 2. Metadata passed through ``meta`` to this function
# 3. Default metadata

# As the Metadata appearing in the SSSOM document is already parsed by LinkML
# we only need to overwrite the metadata from 2 and 3 if it is not present
combine_meta = dict(
ChainMap(
meta or {},
get_default_metadata(),
)
)

_set_metadata_in_mapping_set(mapping_set, metadata=combine_meta, overwrite=False)
mapping_set_document = MappingSetDocument(mapping_set=mapping_set, converter=converter)
return to_mapping_set_dataframe(mapping_set_document)

Expand Down Expand Up @@ -735,13 +778,19 @@ def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:


def _set_metadata_in_mapping_set(
    mapping_set: MappingSet, metadata: Optional[MetadataType] = None, overwrite: bool = True
) -> None:
    """Copy metadata entries onto a mapping set in place.

    :param mapping_set: The mapping set to mutate.
    :param metadata: Metadata dictionary to apply; ``curie_map`` entries are
        skipped, since the prefix map is handled separately by the callers.
    :param overwrite: If false, entries whose slot is already set (non-None)
        on the mapping set are left untouched; if true (default), every
        entry in ``metadata`` replaces the current value.
    """
    if metadata is None:
        logging.info("Tried setting metadata but none provided.")
        return
    for k, v in metadata.items():
        if k == CURIE_MAP:
            continue
        # In non-overwrite mode, keep any value the mapping set already has.
        if not overwrite and hasattr(mapping_set, k) and getattr(mapping_set, k) is not None:
            continue
        mapping_set[k] = _address_multivalued_slot(k, v)


Expand Down
1 change: 1 addition & 0 deletions tests/data/basic_subset.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# b: "http://example.org/b/"
# c: "http://example.org/c/"
# d: "http://example.org/d/"
# orcid: "https://orcid.org/"
subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment
x:appendage appendage owl:equivalentClass y:appendage appendages semapv:ManualMappingCuration x y rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag .
x:appendage appendage owl:equivalentClass z:appendage APPENDAGE semapv:ManualMappingCuration x z rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag .
95 changes: 92 additions & 3 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import json
import math
import os
import tempfile
import unittest
from collections import ChainMap
from pathlib import Path
from tempfile import TemporaryDirectory
from textwrap import dedent
from xml.dom import minidom

Expand All @@ -27,10 +28,11 @@
from_sssom_dataframe,
from_sssom_json,
from_sssom_rdf,
parse_sssom_json,
parse_sssom_table,
)
from sssom.util import MappingSetDataFrame, sort_df_rows_columns
from sssom.writers import write_table
from sssom.writers import write_json, write_table
from tests.test_data import data_dir as test_data_dir
from tests.test_data import test_out_dir

Expand Down Expand Up @@ -378,7 +380,7 @@ def test_round_trip(self):
set(msdf.prefix_map),
)

with tempfile.TemporaryDirectory() as directory:
with TemporaryDirectory() as directory:
directory = Path(directory)
path = directory.joinpath("test.sssom.tsv")
with path.open("w") as file:
Expand Down Expand Up @@ -410,3 +412,90 @@ def test_round_trip(self):

# This checks that nothing funny gets added unexpectedly
self.assertEqual(expected_prefix_map, reconsitited_msdf.prefix_map)

def test_round_trip_json_tsv(self):
    """Test TSV => JSON => TSV using convert() + parse()."""
    header = [
        "subject_id",
        "subject_label",
        "predicate_id",
        "object_id",
        "object_label",
        "mapping_justification",
        "creator_id",
    ]
    records = [
        (
            "DOID:0050601",
            "ADULT syndrome",
            "skos:exactMatch",
            "UMLS:C1863204",
            "ADULT SYNDROME",
            "semapv:ManualMappingCuration",
            "orcid:0000-0003-4423-4370",
        )
    ]

    frame = pd.DataFrame(records, columns=header)
    msdf = MappingSetDataFrame(df=frame, converter=ensure_converter())
    msdf.clean_prefix_map(strict=True)

    #: Prefixes explicitly used in the rows above. SSSOM-py also adds the
    #: remaining builtin prefixes from
    #: :data:`sssom.context.SSSOM_BUILT_IN_PREFIXES`, which is reflected
    #: in the expectation below.
    used_prefixes = {"DOID", "semapv", "orcid", "skos", "UMLS"}
    self.assertEqual(
        used_prefixes.union(SSSOM_BUILT_IN_PREFIXES),
        set(msdf.prefix_map),
    )

    with TemporaryDirectory() as tmp:
        json_path = Path(tmp).joinpath("test.sssom.json")
        with json_path.open("w") as file:
            write_json(msdf, file)

        reconstituted_msdf = parse_sssom_json(json_path)
        reconstituted_msdf.clean_prefix_map(strict=True)

        extra_meta = {
            "mapping_set_title": "A title",
            "license": "https://w3id.org/sssom/license/test",
        }

        reconstituted_msdf_with_meta = parse_sssom_json(json_path, meta=extra_meta)
        reconstituted_msdf_with_meta.clean_prefix_map(strict=True)

        # Ensure the prefix maps are equal after json parsing and cleaning
        self.assertEqual(
            msdf.prefix_map,
            reconstituted_msdf.prefix_map,
        )

        # Ensure the shape, labels, and values in the data frame are the same after json parsing and cleaning
        self.assertTrue(
            msdf.df.equals(reconstituted_msdf.df),
        )

        # Ensure the metadata is the same after json parsing and cleaning
        self.assertEqual(
            msdf.metadata,
            reconstituted_msdf.metadata,
        )

        expected_meta = dict(
            ChainMap(
                msdf.metadata,
                extra_meta,
            )
        )

        # Ensure the metadata after json parsing with additional metadata corresponds to
        # a chain of the original metadata with the test metadata.
        # In particular, this ensures that fields in the test metadata provided are added
        # to the MappingSet if they are not present, but not updated if they are already present.
        self.assertEqual(
            expected_meta,
            reconstituted_msdf_with_meta.metadata,
        )