Skip to content

Commit

Permalink
Merge pull request #51 from stjude/fix
Browse files Browse the repository at this point in the history
v.3.3.6
  • Loading branch information
rawagiha authored Dec 27, 2024
2 parents bb86cb1 + ece59dc commit b6f7450
Show file tree
Hide file tree
Showing 9 changed files with 149 additions and 22 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ Leverage all resources for best performance.
* ```--include-all-external-calls``` set to include all indels in VCF file supplied by -v. (default: False. Use only calls with PASS in FILTER)
* ```--skip-homopolyer-outlier-analysis``` no outlier analysis for homopolymer indels (repeat > 4) performed if set. (default: False)
* ```--safety-mode``` deactivate parallelism at realignment step. may be required to run with -p > 1 on some platforms. (default: False)
* ```--deactivate-sensitive-mode``` deactivate additional realignments for soft-clipped reads. (default: False)

</p></details>

Expand Down
32 changes: 25 additions & 7 deletions rnaindel/analysis/alignment_feature_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import random
from indelpost import Variant, VariantAlignment

from .timeout import timeout
from .utils import most_common, split, flatten_list_of_list
from .sequence_properties import (
repeat,
Expand Down Expand Up @@ -45,7 +46,9 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000):
df["is_near_boundary"],
df["equivalence_exists"],
df["is_multiallelic"],
df["cplx_variant"],
df["cpos"],
df["cref"],
df["calt"],
) = zip(
*df.apply(
_wrapper,
Expand All @@ -63,6 +66,7 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000):

def _wrapper(row, bam, mapq, downsample_threshold):
variant = row["indel"]

(
indel_size,
is_ins,
Expand Down Expand Up @@ -92,7 +96,9 @@ def _wrapper(row, bam, mapq, downsample_threshold):
is_near_exon_boundaray,
equivalent_exists,
is_multiallelic,
cplx_variant,
cpos,
cref,
calt,
) = (
-1,
-1,
Expand All @@ -113,11 +119,16 @@ def _wrapper(row, bam, mapq, downsample_threshold):
-1,
-1,
-1,
variant,
-1,
-1,
-1,
)

if "N" not in variant.alt and "N" not in variant.ref:
res = make_indel_alignment(variant, bam, downsample_threshold)
try:
res = make_indel_alignment(variant, bam, downsample_threshold)
except:
res = None
else:
res = None

Expand All @@ -135,7 +146,9 @@ def _wrapper(row, bam, mapq, downsample_threshold):
loc_strength,
dissim,
indel_complexity,
cplx_variant,
cpos,
cref,
calt,
) = sequence_features(variant, valn, contig)
(
ref_cnt,
Expand Down Expand Up @@ -181,7 +194,9 @@ def _wrapper(row, bam, mapq, downsample_threshold):
is_near_exon_boundaray,
equivalent_exists,
is_multiallelic,
cplx_variant,
cpos,
cref,
calt,
)


Expand All @@ -206,6 +221,7 @@ def indel_type_features(variant):
return indel_size, is_ins, is_at_ins, is_at_del, is_gc_ins, is_gc_del


@timeout(2)
def make_indel_alignment(variant, bam, downsample_threshold=1500):

try:
Expand Down Expand Up @@ -352,7 +368,9 @@ def sequence_features(target_indel, valn, contig):
loc_strength,
dissim,
indel_complexity,
cplx_var,
cplx_var.pos,
cplx_var.ref,
cplx_var.alt,
)


Expand Down
41 changes: 38 additions & 3 deletions rnaindel/analysis/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys
import tempfile
import argparse
import pandas as pd
from functools import partial

from rnaindel.defaultcaller.defaultcaller import callindel
Expand Down Expand Up @@ -35,11 +36,14 @@ def analyze(subcommand, version=None):

n_processes = 1 if region else args.process_num

check_cosmic(data_dir)

with tempfile.TemporaryDirectory() as tmp_dir:
callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes)
realn_softclips(
bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode
)
if not args.deactivate_sensitive_mode:
realn_softclips(
bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode
)

df = preprocess(
tmp_dir,
Expand Down Expand Up @@ -67,6 +71,28 @@ def analyze(subcommand, version=None):
write_vcf(df, version, args, tdna, ndna)


def check_cosmic(data_dir):
path = os.path.join(data_dir, "cosmic")

vcf_gz_ok = False
gz_tbi_ok = False
for _ in os.listdir(path):
if not vcf_gz_ok and _.endswith(".vcf.gz"):
vcf_gz_ok = True

if not gz_tbi_ok and _.endswith(".gz.tbi"):
gz_tbi_ok = True

if vcf_gz_ok and gz_tbi_ok:
pass
else:
print(
"ERROR: check COSMIC VCF file and index under {}/cosmic".format(data_dir),
file=sys.stderr,
)
sys.exit(1)


def get_args(subcommand):
prog = "rnaindel " + subcommand
parser = argparse.ArgumentParser(prog=prog)
Expand Down Expand Up @@ -199,6 +225,15 @@ def get_args(subcommand):
)

parser.set_defaults(safety_mode=False)

parser.add_argument(
"--deactivate-sensitive-mode",
dest="deactivate_sensitive_mode",
action="store_true",
help="deactivate additional realignments around softclips (experimental feature)",
)

parser.set_defaults(deactivate_sensitive_mode=False)
else:
parser.add_argument(
"-o",
Expand Down
2 changes: 1 addition & 1 deletion rnaindel/analysis/postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def postprocess(df, data_dir, perform_outlier_analysis, pon):
if perform_outlier_analysis:
df = outlier_analysis(df, os.path.join(data_dir, "outliers"))

df["cpos"], df["cref"], df["calt"] = zip(*df.apply(expand_complex, axis=1))
# df["cpos"], df["cref"], df["calt"] = zip(*df.apply(expand_complex, axis=1))

dfg = df.groupby(["chrom", "cpos", "cref", "calt"])
df = dfg.apply(recheck_caller_origin_by_complex_representation)
Expand Down
54 changes: 47 additions & 7 deletions rnaindel/analysis/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,58 @@ def preprocess(
)
for call_set_by_chrom in callsets_by_chrom
]

df = pd.concat(dfs)
else:
pool = Pool(num_of_processes)

dfs = pool.map(
# workaround for higher coverage datasets (eg., cell-free RNA-Seq)
_data_files = pool.map(
partial(
calculate_features,
fasta_file=fasta_file,
bam_file=bam_file,
data_dir=data_dir,
mapq=mapq,
external_vcf=external_vcf,
out_as_df=False,
),
callsets_by_chrom,
)

pool.close()
pool.join()

df = pd.concat(dfs)
dfs = []
for _ in _data_files:
if ".done.txt" in _:
dfs.append(
pd.read_csv(
_,
sep="\t",
dtype={
"is_in_cdd": bool,
"is_ins": bool,
"is_bidirectional": bool,
},
)
)

df = pd.concat(dfs)

fasta = pysam.FastaFile(fasta_file)
df["indel"] = df.apply(_remake, fasta=fasta, axis=1)

return df


def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_vcf):
def _remake(row, fasta):
return Variant(row["chrom"], row["pos"], row["ref"], row["alt"], fasta)


def calculate_features(
callset, fasta_file, bam_file, data_dir, mapq, external_vcf, out_as_df=True
):
path_to_coding_gene_db = "{}/refgene/refCodingExon.bed.gz".format(data_dir)
path_to_proteindb = "{}/protein/proteinConservedDomains.txt".format(data_dir)
path_to_dbsnp = "{}/dbsnp/dbsnp.indel.vcf.gz".format(data_dir)
Expand All @@ -84,15 +112,25 @@ def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_v
callset, fasta_file, path_to_coding_gene_db, external_vcf
)

_df_filename = callset.replace(".txt", ".failed.txt")
if len(df) > 0:
df = transcript_features(df, path_to_proteindb)
if len(df) > 0:
df = alignment_features(df, bam_file, mapq)

if len(df) > 0:
return database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic)

return make_empty_df()
df = database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic)
if out_as_df:
return df
else:
_df_filename = _df_filename.replace(".failed.txt", ".done.txt")
df.to_csv(_df_filename, sep="\t", index=False)
return _df_filename

if out_as_df:
return make_empty_df()
else:
return _df_filename


def filter_non_coding_indels(callset, fasta_file, path_to_coding_gene_db, external_vcf):
Expand Down Expand Up @@ -221,7 +259,9 @@ def make_empty_df():
"is_near_boundary",
"equivalence_exists",
"is_multiallelic",
"cplx_variant",
"cpos",
"cref",
"calt",
"dbsnp",
"pop_freq",
"is_common",
Expand Down
31 changes: 31 additions & 0 deletions rnaindel/analysis/timeout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import errno
import os
import signal
import functools

# taken from:
# https://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish


class TimeoutError(Exception):
pass


def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
def decorator(func):
def _handle_timeout(signum, frame):
raise TimeoutError(error_message)

@functools.wraps(func)
def wrapper(*args, **kwargs):
signal.signal(signal.SIGALRM, _handle_timeout)
signal.alarm(seconds)
try:
result = func(*args, **kwargs)
finally:
signal.alarm(0)
return result

return wrapper

return decorator
4 changes: 2 additions & 2 deletions rnaindel/analysis/transcript_feature_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _wrapper(row, domain_dict):
0,
0,
0,
0,
False,
"",
)

Expand Down Expand Up @@ -162,7 +162,7 @@ def in_conserved_domain_is_most_common(coding_indel_isoforms, domain_dict):
def is_within(isoform, domain_dict):
domain_position_lst = domain_dict.get(isoform.accession, [])
if not domain_position_lst:
return 0
return False

codon_pos = isoform.codon_pos
domain_position_lst.append(codon_pos)
Expand Down
4 changes: 3 additions & 1 deletion rnaindel/analysis/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def get_cmd_line(args):
"--normal--dna {} --region {} --pon {} "
"--include-all-external-calls {} "
"--skip-homopolyer-outlier-analysis {} "
"--safety-mode {}".format(
"--safety-mode {} "
"--deactivate-sensitive-mode {}".format(
os.path.abspath(args.bam),
os.path.abspath(args.reference),
os.path.abspath(args.output_vcf),
Expand All @@ -77,6 +78,7 @@ def get_cmd_line(args):
(not args.pass_only),
(not args.perform_outlier_analysis),
(args.safety_mode),
(args.deactivate_sensitive_mode),
)
)

Expand Down
2 changes: 1 addition & 1 deletion rnaindel/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.3.5"
__version__ = "3.3.6"

0 comments on commit b6f7450

Please sign in to comment.