[WIP] Feature/docref additions (#98)
* Add system-level category key/value for multi-element codings

* Add support for multiple identifiers

* update research subject to use SimplifiedResource

* temp commit

* initial research subject update, requires Condition.ndjson for pytest fixture

* add SimplifiedCondition to override codings() method

* update research subject to use SimplifiedResource

* resolve rebase conflict

* test fixture for condition and associated references (#96)

Includes new Condition, Observation and Encounter metadata

* simplify pulling observation values; clarify get_resources_by_reference

* simplify dataframer command line args

* add SimplifiedCondition to override codings() method

* fix pytests (still need to test new condition fixture); linting

* Add system-level category key/value for multi-element codings

* add SimplifiedCondition to override codings() method

* fix tests

* fix tests

* make data loadable

* Add observation code

* enable multiple focus references; allow specified dataframer output_path

* improve error handling

* improve tests and write logs to file

* add expected invalid docref fixture

* Preserve functionality through backend bulk loader swap

---------

Co-authored-by: quinnwai <quinnwai.wong@gmail.com>
Co-authored-by: Nasim Sanati (Rieker) <nasim@plenary.org>
3 people authored Dec 2, 2024
1 parent 94281ad commit 04d6e8b
Showing 8 changed files with 165 additions and 53 deletions.
15 changes: 13 additions & 2 deletions gen3_tracker/gen3/jobs.py
@@ -9,6 +9,7 @@

from gen3.auth import Gen3Auth
from gen3.jobs import Gen3Jobs
+import pytz

from gen3_tracker import Config
from gen3_tracker.common import Push, Commit
@@ -129,11 +130,13 @@ def publish_commits(config: Config, wait: bool, auth: Gen3Auth, bucket_name: str
    from cdislogging import get_logger  # noqa
    cdis_logging = get_logger("__name__")
    cdis_logging.setLevel(logging.WARN)

    if wait:
        # async_run_job_and_wait monkeypatched below
        _ = asyncio.run(jobs_client.async_run_job_and_wait(job_name='fhir_import_export', job_input=args, spinner=spinner))
    else:
        _ = jobs_client.create_job('fhir_import_export', args)

    if not isinstance(_, dict):
        _ = {'output': _}
    if isinstance(_['output'], str):
@@ -183,7 +186,15 @@ async def async_run_job_and_wait(self, job_name, job_input, spinner=None, _ssl=N
            spinner.text = f"{status.get('name')} {status.get('status')}"

    if status.get("status") != "Completed":
-        raise Exception(f"Job status not complete: {status.get('status')}.")
+        # write failed output to log file before raising exception
+        response = await self.async_get_output(job_create_response.get("uid"))
+        with open("logs/publish.log", 'a') as f:
+            log_msg = {'timestamp': datetime.now(pytz.UTC).isoformat()}
+            log_msg.update(response)
+            f.write(json.dumps(log_msg, separators=(',', ':')))
+            f.write('\n')
+
+        raise Exception(f"Job status not complete: {status.get('status')}")

    response = await self.async_get_output(job_create_response.get("uid"))
    return response
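
Note on the failure path above: each failed job appends one compact JSON object per line (JSON Lines) to logs/publish.log, so the log stays machine-parseable. A minimal sketch of reading it back, assuming only that each line is a JSON object containing at least a timestamp field:

    import json

    # each line of logs/publish.log is a self-contained JSON object
    with open("logs/publish.log") as f:
        for line in f:
            entry = json.loads(line)
            print(entry["timestamp"], entry)
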
18 changes: 13 additions & 5 deletions gen3_tracker/git/cli.py
@@ -452,7 +452,7 @@ def push(ctx, step: str, transfer_method: str, overwrite: bool, re_run: bool, wa
    headers = {"Authorization": f"{auth._access_token}"}
    result = requests.delete(url=f'{auth.endpoint}/Bundle', data=orjson.dumps(bundle_data, default=_default_json_serializer,
                             option=orjson.OPT_APPEND_NEWLINE).decode(), headers=headers)

    with open("logs/publish.log", 'a') as f:
        log_msg = {'timestamp': datetime.now(pytz.UTC).isoformat(), "result": f"{result}"}
        click.secho('Published project. See logs/publish.log', fg=SUCCESS_COLOR, file=sys.stderr)
@@ -489,20 +489,28 @@ def push(ctx, step: str, transfer_method: str, overwrite: bool, re_run: bool, wa
        return

    if step in ['publish', 'all'] and not fhir_server:
+        log_path = "logs/publish.log"
+
        with Halo(text='Uploading snapshot', spinner='line', placement='right', color='white'):
            # push the snapshot of the `.git` sub-directory in the current directory
            push_snapshot(config, auth=auth)

        if transfer_method == 'gen3':
-            with Halo(text='Publishing', spinner='line', placement='right', color='white') as spinner:
+            try:
                # legacy, "old" fhir_import_export use publish_commits to publish the META
-                _ = publish_commits(config, wait=wait, auth=auth, bucket_name=bucket_name, spinner=spinner)
-            click.secho('Published project. See logs/publish.log', fg=SUCCESS_COLOR, file=sys.stderr)
-            with open("logs/publish.log", 'a') as f:
+                with Halo(text='Publishing', spinner='line', placement='right', color='white') as spinner:
+                    _ = publish_commits(config, wait=wait, auth=auth, bucket_name=bucket_name, spinner=spinner)
+            except Exception as e:
+                click.secho(f'Unable to publish project. See {log_path} for more info', fg=ERROR_COLOR, file=sys.stderr)
+                raise e
+
+            # print success message and save logs
+            with open(log_path, 'a') as f:
                log_msg = {'timestamp': datetime.now(pytz.UTC).isoformat()}
                log_msg.update(_)
                f.write(json.dumps(log_msg, separators=(',', ':')))
                f.write('\n')
+            click.secho(f'Published project. Logs found at {log_path}', fg=SUCCESS_COLOR, file=sys.stderr)
        else:
            click.secho(f'Auto-publishing not supported for {transfer_method}. Please use --step publish after uploading', fg=ERROR_COLOR, file=sys.stderr)

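For transfer methods other than gen3, the publish step is no longer run automatically; per the message above it must be invoked explicitly after uploading, e.g. (assuming the package's g3t command-line entry point):

    g3t push --step publish
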
16 changes: 11 additions & 5 deletions gen3_tracker/meta/cli.py
@@ -101,14 +101,16 @@ def render_graph(config: Config, directory_path: str, output_path: str, browser:


@meta.command("dataframe")
+@click.argument('data_type',
+                required=True,
+                type=click.Choice(['Specimen', 'DocumentReference', 'ResearchSubject']),
+                default=None)
@click.argument("directory_path",
                type=click.Path(exists=True, file_okay=False),
                default="./META", required=False)
@click.argument("output_path",
-                type=click.Path(file_okay=True),
-                default="meta.csv", required=False)
+                type=click.Path(file_okay=True), required=False)
@click.option('--dtale', 'launch_dtale', default=False, show_default=True, is_flag=True, help='Open the graph in a browser using the dtale package for interactive data exploration.')
-@click.option('--data_type', required=True, type=click.Choice(['Specimen', 'DocumentReference', 'ResearchSubject']), default=None, show_default=True, help='Create a data frame for a specific data type.')
@click.option('--debug', is_flag=True)
@click.pass_obj
def render_df(config: Config, directory_path: str, output_path: str, launch_dtale: bool, data_type: str, debug: bool):
@@ -127,9 +129,13 @@ def render_df(config: Config, directory_path: str, output_path: str, launch_dtal
            dtale.show(df, subprocess=False, open_browser=True, port=40000)
        else:
            # export to csv
-            df.to_csv(output_path, index=False)
-            click.secho(f"Saved {output_path}", fg=INFO_COLOR, file=sys.stderr)
+            file_name = output_path if output_path else f"{data_type}.csv"
+            df.to_csv(file_name, index=False)
+            click.secho(f"Saved {file_name}", fg=INFO_COLOR, file=sys.stderr)
    except Exception as e:
        click.secho(str(e), fg=ERROR_COLOR, file=sys.stderr)
+        if config.debug or debug:
+            raise


meta.add_command(render_df, name='df')
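
With data_type now a required positional argument and output_path optional, the CSV file name defaults to <data_type>.csv. Hypothetical invocations, assuming the g3t entry point:

    # writes DocumentReference.csv to the current directory
    g3t meta dataframe DocumentReference ./META

    # an explicit output path still wins
    g3t meta dataframe Specimen ./META specimens.csv
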
27 changes: 12 additions & 15 deletions gen3_tracker/meta/dataframer.py
@@ -405,13 +405,11 @@ def handle_units(self, value_normalized: str):
        string and float data in the same column and gives errors because it is expecting
        only one data type per column"""

-        print("UNITS: ", value_normalized)
        if value_normalized is not None:
            value_normalized_split = value_normalized.split(" ")
            if isinstance(value_normalized_split, list):
                value_numeric = value_normalized_split[0]
                if is_number(value_numeric):
-                    # print("VALUE NUMERIC: ", float(value_numeric))
                    value_normalized = float(value_numeric)
            return value_normalized
        return None
@@ -699,20 +697,19 @@ def get_resources_by_reference(

        # determine how to process the field
        if reference_field == "focus":
-            # error if multiple focuses
-            if reference_field in resource:
-                assert (
-                    len(resource["focus"]) <= 1
-                ), "unable to support more than 1 focus for a single observation"
-                nested_keys = ["focus", 0]
-        elif reference_field == "subject":
-            nested_keys = ["subject"]
+            # add the resource (eg observation) for each focus reference to the dict
+            for i in range(len(resource["focus"])):
+                reference_key = get_nested_value(resource, [reference_field, i, "reference"])
+                if reference_key is not None and reference_type in reference_key:
+                    reference_id = reference_key.split("/")[-1]
+                    resource_by_reference_id[reference_id].append(resource)

-        # add observation to dict if a reference resource exists
-        reference_key = get_nested_value(resource, [*nested_keys, "reference"])
-        if reference_key is not None and reference_type in reference_key:
-            reference_id = reference_key.split("/")[-1]
-            resource_by_reference_id[reference_id].append(resource)
+        elif reference_field == "subject":
+            # add the resource (eg observation) to the dict
+            reference_key = get_nested_value(resource, [reference_field, "reference"])
+            if reference_key is not None and reference_type in reference_key:
+                reference_id = reference_key.split("/")[-1]
+                resource_by_reference_id[reference_id].append(resource)

    return resource_by_reference_id

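The focus branch above replaces the old single-focus assertion with a fan-out: one observation is indexed under every focus reference it carries. A toy sketch of that behavior, with illustrative resource shapes and ids (not the project's fixtures):

    from collections import defaultdict

    observation = {
        "resourceType": "Observation",
        "focus": [{"reference": "Specimen/s1"}, {"reference": "Specimen/s2"}],
    }

    resource_by_reference_id = defaultdict(list)
    for i in range(len(observation["focus"])):
        reference = observation["focus"][i]["reference"]
        if "Specimen" in reference:
            # the same observation lands under both specimen ids
            resource_by_reference_id[reference.split("/")[-1]].append(observation)

    print(sorted(resource_by_reference_id))  # ['s1', 's2']
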
35 changes: 23 additions & 12 deletions gen3_tracker/meta/entities.py
@@ -3,6 +3,7 @@
from pydantic import BaseModel, computed_field
from typing import Dict, List, Optional, Tuple

+
#######################
# FHIR HELPER METHODS #
#######################
@@ -185,7 +186,7 @@ class SimplifiedFHIR(BaseModel):
    @computed_field()
    @property
    def simplified(self) -> dict:
-        _ = {"identifier": self.identifier.get("value", None)}
+        _ = self.identifiers.copy() if self.identifiers else {}
        _.update(self.scalars)
        _.update(self.codings)
        _.update(self.extensions)
@@ -245,6 +246,7 @@ def scalars(self) -> dict:
            if (not isinstance(v, list) and not isinstance(v, dict))
        }

+
    @computed_field
    @property
    def codings(self) -> dict:
@@ -259,25 +261,31 @@
                if isinstance(elem, dict):
                    # TODO: implement hierarchy of codes rather than just taking last code?
                    for value, source in normalize_coding(elem):
-                        _codings[k] = value
+                        if len(v) > 1 and get_nested_value(elem, [source, 0, 'system']):
+                            _codings[elem[source][0]["system"].split("/")[-1]] = value
+                        else:
+                            _codings[k] = value
            elif isinstance(v, dict):
                for value, elem in normalize_coding(v):
                    _codings[k] = value

        return _codings

    @computed_field
    @property
-    def identifier(self) -> dict:
-        """Return the official identifier, or first of a resource."""
+    def identifiers(self) -> dict:
+        """Return the first identifier of a resource and any other identifiers"""
        identifiers = self.resource.get("identifier", [])
-        official_identifiers = [
-            _ for _ in identifiers if _.get("use", "") == "official"
-        ]
-        if not official_identifiers and identifiers:
-            return identifiers[0]
-        elif official_identifiers:
-            return official_identifiers[0]
-        else:
-            return {}
+        identifiers_len = len(identifiers)
+
+        if not identifiers_len:
+            return {"identifier": None}
+        elif identifiers_len == 1:
+            return {"identifier": identifiers[0].get('value')}
+
+        base_identifier = {"identifier": identifiers[0].get('value')}
+        base_identifier.update({identifier.get("system").split("/")[-1]: identifier.get("value") for identifier in identifiers[1:]})
+        return base_identifier

    @computed_field
    @property
@@ -364,6 +372,9 @@ def values(self) -> dict:
            if not value:
                continue
            _values[source] = value
+        if "code" in self.resource and "text" in self.resource["code"]:
+            _values["observation_code"] = self.resource["code"]["text"]
+
        assert len(_values) > 0, f"no values found in Observation: {self.resource}"

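The identifiers property above flattens multiple identifiers into one row: the first identifier keeps the plain identifier key, and each additional identifier is keyed by the last path segment of its system URL. A toy sketch with illustrative values:

    identifiers = [
        {"system": "https://example.org/submitter_id", "value": "case-1"},
        {"system": "https://gdc.cancer.gov/file_id", "value": "adb14cb5"},
    ]

    # first identifier under "identifier", the rest keyed by system suffix
    row = {"identifier": identifiers[0].get("value")}
    row.update({i.get("system").split("/")[-1]: i.get("value") for i in identifiers[1:]})
    print(row)  # {'identifier': 'case-1', 'file_id': 'adb14cb5'}
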
@@ -0,0 +1 @@
{"resourceType":"DocumentReference","id":"6b024bdd-810f-5af0-8f36-9effc5da9b02","identifier":[{"system":"https://gdc.cancer.gov/file_id","value":"adb14cb5-6c6a-4c9a-9611-4f560e696e21"}],"version":"2","basedOn":[{"reference":"Specimen/91749e5f-aff9-5da6-a396-e53da6572bb7"},{"reference":"Specimen/ef84de6b-474e-5f96-a22f-2f105e4006ce"}],"status":"current","type":{"coding":[{"system":"https://gdc.cancer.gov/data_type","code":"VCF","display":"VCF"}]},"category":[{"coding":[{"system":"https://gdc.cancer.gov/data_category","code":"Simple Nucleotide Variation","display":"Simple Nucleotide Variation"}]},{"coding":[{"system":"https://gdc.cancer.gov/experimental_strategy","code":"WXS","display":"WXS"}]}],"subject":{"reference":"Patient/ccf801ae-7d28-5446-9c98-d79185ce2eb1"},"date":"2022-02-03T20:41:18.844488-06:00","content":[{"attachment":{"contentType":"Annotated Somatic Mutation","url":"https://api.gdc.cancer.gov/data/adb14cb5-6c6a-4c9a-9611-4f560e696e21","size":192763,"hash":"e389d0afe4c523b366853d32d89455ad","title":"TCGA_LUAD.c78eaf74-7747-40c3-be4d-0065e4e4d0f2.wxs.VarScan2.somatic_annotation.vcf.gz", "creation": "2023-10-05T12:07:50"},"profile":[{"valueCoding":{"system":"https://gdc.cancer.gov/data_format","code":"VCF","display":"VCF"}}]}]}