Skip to content

Commit

Permalink
refactor: rename inSilicoPredictors to variantEffect
Browse files Browse the repository at this point in the history
  • Loading branch information
vivienho committed Feb 7, 2025
1 parent b86a850 commit 8383ba5
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 71 deletions.
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/variant_index.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
},
{
"metadata": {},
"name": "inSilicoPredictors",
"name": "variantEffect",
"nullable": true,
"type": {
"containsNull": true,
Expand Down
48 changes: 24 additions & 24 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def add_annotation(
"""Import annotation from an other variant index dataset.
At this point the annotation can be extended with extra cross-references,
in-silico predictions, allele frequencies, and variant descriptions.
variant effects, allele frequencies, and variant descriptions.
Args:
annotation_source (VariantIndex): Annotation to add to the dataset
Expand Down Expand Up @@ -305,33 +305,33 @@ def get_loftee(self: VariantIndex) -> DataFrame:
)


class InSilicoPredictorNormaliser:
"""Class to normalise in silico predictor assessments.
class VariantEffectNormaliser:
"""Class to normalise variant effect assessments.
Essentially based on the raw scores, it normalises the scores to a range between -1 and 1, and appends the normalised
value to the in silico predictor struct.
value to the variant effect struct.
Higher negative values indicate an increasingly confident prediction that the variant is benign,
while higher positive values indicate an increasingly deleterious predicted effect.
The point of these operations to make the scores comparable across different in silico predictors.
The point of these operations is to make the scores comparable across different variant effect assessments.
"""

@classmethod
def normalise_in_silico_predictors(
cls: type[InSilicoPredictorNormaliser],
in_silico_predictors: Column,
def normalise_variant_effect(
cls: type[VariantEffectNormaliser],
variant_effect: Column,
) -> Column:
"""Normalise in silico predictors. Appends a normalised score to the in silico predictor struct.
"""Normalise variant effect assessments. Appends a normalised score to the variant effect struct.
Args:
in_silico_predictors (Column): Column containing in silico predictors (list of structs).
variant_effect (Column): Column containing variant effect assessments (list of structs).
Returns:
Column: Normalised in silico predictors.
Column: Normalised variant effect assessments.
"""
return f.transform(
in_silico_predictors,
variant_effect,
lambda predictor: f.struct(
# Extracting all existing columns:
predictor.method.alias("method"),
Expand All @@ -348,20 +348,20 @@ def normalise_in_silico_predictors(

@classmethod
def resolve_predictor_methods(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
method: Column,
assessment: Column,
) -> Column:
"""It takes a score, a method, and an assessment, and returns a normalized score for the in silico predictor.
"""It takes a score, a method, and an assessment, and returns a normalized score for the variant effect.
Args:
score (Column): The raw score from the in silico predictor.
score (Column): The raw score from the variant effect.
method (Column): The method used to generate the score.
assessment (Column): The assessment of the score.
Returns:
Column: Normalised score for the in silico predictor.
Column: Normalised score for the variant effect.
"""
return (
f.when(method == "LOFTEE", cls._normalise_loftee(assessment))
Expand Down Expand Up @@ -403,7 +403,7 @@ def _rescaleColumnValue(

@classmethod
def _normalise_cadd(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise CADD scores.
Expand All @@ -429,7 +429,7 @@ def _normalise_cadd(

@classmethod
def _normalise_gerp(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise GERP scores.
Expand Down Expand Up @@ -461,7 +461,7 @@ def _normalise_gerp(

@classmethod
def _normalise_lof(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
assessment: Column,
) -> Column:
"""Normalise loss-of-function verdicts.
Expand Down Expand Up @@ -490,7 +490,7 @@ def _normalise_lof(

@classmethod
def _normalise_loftee(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
assessment: Column,
) -> Column:
"""Normalise LOFTEE scores.
Expand All @@ -512,7 +512,7 @@ def _normalise_loftee(

@classmethod
def _normalise_sift(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
assessment: Column,
) -> Column:
Expand Down Expand Up @@ -556,7 +556,7 @@ def _normalise_sift(

@classmethod
def _normalise_polyphen(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
assessment: Column,
score: Column,
) -> Column:
Expand Down Expand Up @@ -587,7 +587,7 @@ def _normalise_polyphen(

@classmethod
def _normalise_alpha_missense(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise AlphaMissense scores.
Expand All @@ -611,7 +611,7 @@ def _normalise_alpha_missense(

@classmethod
def _normalise_pangolin(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise Pangolin scores.
Expand Down
50 changes: 25 additions & 25 deletions src/gentropy/datasource/ensembl/vep_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
order_array_of_structs_by_field,
order_array_of_structs_by_two_fields,
)
from gentropy.dataset.variant_index import InSilicoPredictorNormaliser, VariantIndex
from gentropy.dataset.variant_index import VariantEffectNormaliser, VariantIndex

if TYPE_CHECKING:
from pyspark.sql import Column, DataFrame
Expand All @@ -33,9 +33,9 @@ class VariantEffectPredictorParser:

DBXREF_SCHEMA = VariantIndex.get_schema()["dbXrefs"].dataType

# Schema description of the in silico predictor object:
IN_SILICO_PREDICTOR_SCHEMA = get_nested_struct_schema(
VariantIndex.get_schema()["inSilicoPredictors"]
# Schema description of the variant effect object:
VARIANT_EFFECT_SCHEMA = get_nested_struct_schema(
VariantIndex.get_schema()["variantEffect"]
)

# Schema for the allele frequency column:
Expand Down Expand Up @@ -341,7 +341,7 @@ def _get_most_severe_transcript(
)[0]

@classmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
@enforce_schema(VARIANT_EFFECT_SCHEMA)
def _get_vep_prediction(cls, most_severe_consequence: Column) -> Column:
return f.struct(
f.lit("VEP").alias("method"),
Expand All @@ -352,7 +352,7 @@ def _get_vep_prediction(cls, most_severe_consequence: Column) -> Column:
)

@staticmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
@enforce_schema(VARIANT_EFFECT_SCHEMA)
def _get_max_alpha_missense(transcripts: Column) -> Column:
"""Return the most severe alpha missense prediction from all transcripts.
Expand Down Expand Up @@ -410,26 +410,26 @@ def _get_max_alpha_missense(transcripts: Column) -> Column:
)

@classmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
def _vep_in_silico_prediction_extractor(
@enforce_schema(VARIANT_EFFECT_SCHEMA)
def _vep_variant_effect_extractor(
cls: type[VariantEffectPredictorParser],
transcript_column_name: str,
method_name: str,
score_column_name: str | None = None,
assessment_column_name: str | None = None,
assessment_flag_column_name: str | None = None,
) -> Column:
"""Extract in silico prediction from VEP output.
"""Extract variant effect from VEP output.
Args:
transcript_column_name (str): Name of the column containing the list of transcripts.
method_name (str): Name of the in silico predictor.
method_name (str): Name of the variant effect.
score_column_name (str | None): Name of the column containing the score.
assessment_column_name (str | None): Name of the column containing the assessment.
assessment_flag_column_name (str | None): Name of the column containing the assessment flag.
Returns:
Column: In silico predictor.
Column: Variant effect.
"""
# Get transcript with the highest score:
most_severe_transcript: Column = (
Expand Down Expand Up @@ -634,42 +634,42 @@ def process_vep_output(
cls._extract_clinvar_xrefs(f.col("colocated_variants")).alias(
"clinvar_xrefs"
),
# Extracting in silico predictors
# Extracting variant effect assessments
f.when(
# The following in-silico predictors are only available for variants with transcript consequences:
# The following variant effect assessments are only available for variants with transcript consequences:
f.col("transcript_consequences").isNotNull(),
f.filter(
f.array(
# Extract CADD scores:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
transcript_column_name="transcript_consequences",
method_name="CADD",
score_column_name="cadd_phred",
),
# Extract polyphen scores:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
transcript_column_name="transcript_consequences",
method_name="PolyPhen",
score_column_name="polyphen_score",
assessment_column_name="polyphen_prediction",
),
# Extract sift scores:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
transcript_column_name="transcript_consequences",
method_name="SIFT",
score_column_name="sift_score",
assessment_column_name="sift_prediction",
),
# Extract loftee scores:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
method_name="LOFTEE",
transcript_column_name="transcript_consequences",
score_column_name="lof",
assessment_column_name="lof",
assessment_flag_column_name="lof_filter",
),
# Extract GERP conservation score:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
method_name="GERP",
transcript_column_name="transcript_consequences",
score_column_name="conservation",
Expand All @@ -687,13 +687,13 @@ def process_vep_output(
.otherwise(
# Extract CADD scores from intergenic object:
f.array(
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
transcript_column_name="intergenic_consequences",
method_name="CADD",
score_column_name="cadd_phred",
),
# Extract GERP conservation score:
cls._vep_in_silico_prediction_extractor(
cls._vep_variant_effect_extractor(
method_name="GERP",
transcript_column_name="intergenic_consequences",
score_column_name="conservation",
Expand All @@ -702,7 +702,7 @@ def process_vep_output(
cls._get_vep_prediction(f.col("most_severe_consequence")),
)
)
.alias("inSilicoPredictors"),
.alias("variantEffect"),
# Convert consequence to SO:
map_column_by_dictionary(
f.col("most_severe_consequence"), cls.SEQUENCE_ONTOLOGY_MAP
Expand Down Expand Up @@ -882,11 +882,11 @@ def process_vep_output(
)[f.size("proteinCodingTranscripts") - 1],
),
)
# Normalising in silico predictor assessments:
# Normalising variant effect assessments:
.withColumn(
"inSilicoPredictors",
InSilicoPredictorNormaliser.normalise_in_silico_predictors(
f.col("inSilicoPredictors")
"variantEffect",
VariantEffectNormaliser.normalise_variant_effect(
f.col("variantEffect")
),
)
# Dropping intermediate xref columns:
Expand Down
16 changes: 8 additions & 8 deletions src/gentropy/datasource/open_targets/lof_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pyspark.sql.types as t

from gentropy.common.spark_helpers import enforce_schema
from gentropy.dataset.variant_index import InSilicoPredictorNormaliser, VariantIndex
from gentropy.dataset.variant_index import VariantEffectNormaliser, VariantIndex

if TYPE_CHECKING:
from pyspark.sql import Column, DataFrame
Expand All @@ -17,20 +17,20 @@
class OpenTargetsLOF:
"""Class to parse Loss-of-Function variant data from Open Targets Project OTAR2075."""

IN_SILICO_PREDICTOR_SCHEMA = VariantIndex.get_schema()[
"inSilicoPredictors"
VARIANT_EFFECT_SCHEMA = VariantIndex.get_schema()[
"variantEffect"
].dataType.elementType

@staticmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
@enforce_schema(VARIANT_EFFECT_SCHEMA)
def _get_lof_assessment(verdict: Column) -> Column:
"""Get curated Loss-of-Function assessment from verdict column.
Args:
verdict (Column): verdict column from the input dataset.
Returns:
Column: struct following the in silico predictor schema.
Column: struct following the variant effect schema.
"""
return f.struct(
f.lit("LossOfFunctionCuration").alias("method"),
Expand Down Expand Up @@ -87,12 +87,12 @@ def as_variant_index(
f.col("h38.pos").cast(t.IntegerType()).alias("position"),
f.col("h37.ref").alias("referenceAllele"),
f.col("h37.alt").alias("alternateAllele"),
# Populate inSilicoPredictors field:
f.array(cls._get_lof_assessment(f.col("Verdict"))).alias("inSilicoPredictors"),
# Populate variantEffect and variantDescription fields:
f.array(cls._get_lof_assessment(f.col("Verdict"))).alias("variantEffect"),
cls._compose_lof_description(f.col("Verdict")).alias("variantDescription"),
)
# Convert assessments to normalised scores:
.withColumn("inSilicoPredictors", InSilicoPredictorNormaliser.normalise_in_silico_predictors(f.col("inSilicoPredictors")))
.withColumn("variantEffect", VariantEffectNormaliser.normalise_variant_effect(f.col("variantEffect")))
),
_schema=VariantIndex.get_schema(),
)
2 changes: 1 addition & 1 deletion tests/gentropy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def mock_variant_index(spark: SparkSession) -> VariantIndex:
# https://github.com/databrickslabs/dbldatagen/issues/135
# It's a workaround for nested column handling in dbldatagen.
.withColumnSpec(
"inSilicoPredictors",
"variantEffect",
expr="""
array(
named_struct(
Expand Down
Loading

0 comments on commit 8383ba5

Please sign in to comment.