From ece59dc4d15450de9ce1dd2ef9da85092cfb14a7 Mon Sep 17 00:00:00 2001 From: Kohei Hagiwara Date: Fri, 27 Dec 2024 10:40:40 -0600 Subject: [PATCH] v.3.3.6 --- README.md | 1 + .../analysis/alignment_feature_calculator.py | 32 ++++++++--- rnaindel/analysis/analyzer.py | 41 ++++++++++++-- rnaindel/analysis/postprocessor.py | 2 +- rnaindel/analysis/preprocessor.py | 54 ++++++++++++++++--- rnaindel/analysis/timeout.py | 31 +++++++++++ .../analysis/transcript_feature_calculator.py | 4 +- rnaindel/analysis/vcf_writer.py | 4 +- rnaindel/version.py | 2 +- 9 files changed, 149 insertions(+), 22 deletions(-) create mode 100644 rnaindel/analysis/timeout.py diff --git a/README.md b/README.md index 5b86ece..80ff38a 100755 --- a/README.md +++ b/README.md @@ -194,6 +194,7 @@ Leverage all resources for best performance. * ```--include-all-external-calls``` set to include all indels in VCF file supplied by -v. (default: False. Use only calls with PASS in FILTER) * ```--skip-homopolyer-outlier-analysis``` no outlier analysis for homopolymer indels (repeat > 4) performed if set. (default: False) * ```--safety-mode``` deactivate parallelism at realignment step. may be required to run with -p > 1 on some platforms. (default: False) + * ```--deactivate-sensitive-mode``` deactivate additional realignments for soft-clipped reads. (default: False)

diff --git a/rnaindel/analysis/alignment_feature_calculator.py b/rnaindel/analysis/alignment_feature_calculator.py index 6191a82..45ccd0e 100755 --- a/rnaindel/analysis/alignment_feature_calculator.py +++ b/rnaindel/analysis/alignment_feature_calculator.py @@ -3,6 +3,7 @@ import random from indelpost import Variant, VariantAlignment +from .timeout import timeout from .utils import most_common, split, flatten_list_of_list from .sequence_properties import ( repeat, @@ -45,7 +46,9 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000): df["is_near_boundary"], df["equivalence_exists"], df["is_multiallelic"], - df["cplx_variant"], + df["cpos"], + df["cref"], + df["calt"], ) = zip( *df.apply( _wrapper, @@ -63,6 +66,7 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000): def _wrapper(row, bam, mapq, downsample_threshold): variant = row["indel"] + ( indel_size, is_ins, @@ -92,7 +96,9 @@ def _wrapper(row, bam, mapq, downsample_threshold): is_near_exon_boundaray, equivalent_exists, is_multiallelic, - cplx_variant, + cpos, + cref, + calt, ) = ( -1, -1, @@ -113,11 +119,16 @@ def _wrapper(row, bam, mapq, downsample_threshold): -1, -1, -1, - variant, + -1, + -1, + -1, ) if "N" not in variant.alt and "N" not in variant.ref: - res = make_indel_alignment(variant, bam, downsample_threshold) + try: + res = make_indel_alignment(variant, bam, downsample_threshold) + except: + res = None else: res = None @@ -135,7 +146,9 @@ def _wrapper(row, bam, mapq, downsample_threshold): loc_strength, dissim, indel_complexity, - cplx_variant, + cpos, + cref, + calt, ) = sequence_features(variant, valn, contig) ( ref_cnt, @@ -181,7 +194,9 @@ def _wrapper(row, bam, mapq, downsample_threshold): is_near_exon_boundaray, equivalent_exists, is_multiallelic, - cplx_variant, + cpos, + cref, + calt, ) @@ -206,6 +221,7 @@ def indel_type_features(variant): return indel_size, is_ins, is_at_ins, is_at_del, is_gc_ins, is_gc_del +@timeout(2) def make_indel_alignment(variant, bam, downsample_threshold=1500): try: @@ -352,7 +368,9 @@ def sequence_features(target_indel, valn, contig): loc_strength, dissim, indel_complexity, - cplx_var, + cplx_var.pos, + cplx_var.ref, + cplx_var.alt, ) diff --git a/rnaindel/analysis/analyzer.py b/rnaindel/analysis/analyzer.py index 97778ad..6f32e36 100755 --- a/rnaindel/analysis/analyzer.py +++ b/rnaindel/analysis/analyzer.py @@ -2,6 +2,7 @@ import sys import tempfile import argparse +import pandas as pd from functools import partial from rnaindel.defaultcaller.defaultcaller import callindel @@ -35,11 +36,14 @@ def analyze(subcommand, version=None): n_processes = 1 if region else args.process_num + check_cosmic(data_dir) + with tempfile.TemporaryDirectory() as tmp_dir: callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes) - realn_softclips( - bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode - ) + if not args.deactivate_sensitive_mode: + realn_softclips( + bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode + ) df = preprocess( tmp_dir, @@ -67,6 +71,28 @@ def analyze(subcommand, version=None): write_vcf(df, version, args, tdna, ndna) +def check_cosmic(data_dir): + path = os.path.join(data_dir, "cosmic") + + vcf_gz_ok = False + gz_tbi_ok = False + for _ in os.listdir(path): + if not vcf_gz_ok and _.endswith(".vcf.gz"): + vcf_gz_ok = True + + if not gz_tbi_ok and _.endswith(".gz.tbi"): + gz_tbi_ok = True + + if vcf_gz_ok and gz_tbi_ok: + pass + else: + print( + "ERROR: check COSMIC VCF file and index under {}/cosmic".format(data_dir), + file=sys.stderr, + ) + sys.exit(1) + + def get_args(subcommand): prog = "rnaindel " + subcommand parser = argparse.ArgumentParser(prog=prog) @@ -199,6 +225,15 @@ def get_args(subcommand): ) parser.set_defaults(safety_mode=False) + + parser.add_argument( + "--deactivate-sensitive-mode", + dest="deactivate_sensitive_mode", + action="store_true", + help="deactivate additional realignments around softclips (experimental feature)", + ) + + parser.set_defaults(deactivate_sensitive_mode=False) else: parser.add_argument( "-o", diff --git a/rnaindel/analysis/postprocessor.py b/rnaindel/analysis/postprocessor.py index cd20ea6..7864314 100755 --- a/rnaindel/analysis/postprocessor.py +++ b/rnaindel/analysis/postprocessor.py @@ -19,7 +19,7 @@ def postprocess(df, data_dir, perform_outlier_analysis, pon): if perform_outlier_analysis: df = outlier_analysis(df, os.path.join(data_dir, "outliers")) - df["cpos"], df["cref"], df["calt"] = zip(*df.apply(expand_complex, axis=1)) + # df["cpos"], df["cref"], df["calt"] = zip(*df.apply(expand_complex, axis=1)) dfg = df.groupby(["chrom", "cpos", "cref", "calt"]) df = dfg.apply(recheck_caller_origin_by_complex_representation) diff --git a/rnaindel/analysis/preprocessor.py b/rnaindel/analysis/preprocessor.py index 3f7f58d..7c96b83 100755 --- a/rnaindel/analysis/preprocessor.py +++ b/rnaindel/analysis/preprocessor.py @@ -50,10 +50,13 @@ def preprocess( ) for call_set_by_chrom in callsets_by_chrom ] + + df = pd.concat(dfs) else: pool = Pool(num_of_processes) - dfs = pool.map( + # workaround for higher coverage datasets (eg., cell-free RNA-Seq) + _data_files = pool.map( partial( calculate_features, fasta_file=fasta_file, @@ -61,6 +64,7 @@ def preprocess( data_dir=data_dir, mapq=mapq, external_vcf=external_vcf, + out_as_df=False, ), callsets_by_chrom, ) @@ -68,12 +72,36 @@ def preprocess( pool.close() pool.join() - df = pd.concat(dfs) + dfs = [] + for _ in _data_files: + if ".done.txt" in _: + dfs.append( + pd.read_csv( + _, + sep="\t", + dtype={ + "is_in_cdd": bool, + "is_ins": bool, + "is_bidirectional": bool, + }, + ) + ) + + df = pd.concat(dfs) + + fasta = pysam.FastaFile(fasta_file) + df["indel"] = df.apply(_remake, fasta=fasta, axis=1) return df -def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_vcf): +def _remake(row, fasta): + return Variant(row["chrom"], row["pos"], row["ref"], row["alt"], fasta) + + +def calculate_features( + callset, fasta_file, bam_file, data_dir, mapq, external_vcf, out_as_df=True +): path_to_coding_gene_db = "{}/refgene/refCodingExon.bed.gz".format(data_dir) path_to_proteindb = "{}/protein/proteinConservedDomains.txt".format(data_dir) path_to_dbsnp = "{}/dbsnp/dbsnp.indel.vcf.gz".format(data_dir) @@ -84,15 +112,25 @@ def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_v callset, fasta_file, path_to_coding_gene_db, external_vcf ) + _df_filename = callset.replace(".txt", ".failed.txt") if len(df) > 0: df = transcript_features(df, path_to_proteindb) if len(df) > 0: df = alignment_features(df, bam_file, mapq) if len(df) > 0: - return database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic) - - return make_empty_df() + df = database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic) + if out_as_df: + return df + else: + _df_filename = _df_filename.replace(".failed.txt", ".done.txt") + df.to_csv(_df_filename, sep="\t", index=False) + return _df_filename + + if out_as_df: + return make_empty_df() + else: + return _df_filename def filter_non_coding_indels(callset, fasta_file, path_to_coding_gene_db, external_vcf): @@ -221,7 +259,9 @@ def make_empty_df(): "is_near_boundary", "equivalence_exists", "is_multiallelic", - "cplx_variant", + "cpos", + "cref", + "calt", "dbsnp", "pop_freq", "is_common", diff --git a/rnaindel/analysis/timeout.py b/rnaindel/analysis/timeout.py new file mode 100644 index 0000000..30365d2 --- /dev/null +++ b/rnaindel/analysis/timeout.py @@ -0,0 +1,31 @@ +import errno +import os +import signal +import functools + +# taken from: +# https://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish + + +class TimeoutError(Exception): + pass + + +def timeout(seconds=10, error_message=os.strerror(errno.ETIME)): + def decorator(func): + def _handle_timeout(signum, frame): + raise TimeoutError(error_message) + + @functools.wraps(func) + def wrapper(*args, **kwargs): + signal.signal(signal.SIGALRM, _handle_timeout) + signal.alarm(seconds) + try: + result = func(*args, **kwargs) + finally: + signal.alarm(0) + return result + + return wrapper + + return decorator diff --git a/rnaindel/analysis/transcript_feature_calculator.py b/rnaindel/analysis/transcript_feature_calculator.py index 95f3bb8..0059d00 100755 --- a/rnaindel/analysis/transcript_feature_calculator.py +++ b/rnaindel/analysis/transcript_feature_calculator.py @@ -57,7 +57,7 @@ def _wrapper(row, domain_dict): 0, 0, 0, - 0, + False, "", ) @@ -162,7 +162,7 @@ def in_conserved_domain_is_most_common(coding_indel_isoforms, domain_dict): def is_within(isoform, domain_dict): domain_position_lst = domain_dict.get(isoform.accession, []) if not domain_position_lst: - return 0 + return False codon_pos = isoform.codon_pos domain_position_lst.append(codon_pos) diff --git a/rnaindel/analysis/vcf_writer.py b/rnaindel/analysis/vcf_writer.py index 2d01b44..ce35878 100755 --- a/rnaindel/analysis/vcf_writer.py +++ b/rnaindel/analysis/vcf_writer.py @@ -61,7 +61,8 @@ def get_cmd_line(args): "--normal--dna {} --region {} --pon {} " "--include-all-external-calls {} " "--skip-homopolyer-outlier-analysis {} " - "--safety-mode {}".format( + "--safety-mode {} " + "--deactivate-sensitive-mode {}".format( os.path.abspath(args.bam), os.path.abspath(args.reference), os.path.abspath(args.output_vcf), @@ -77,6 +78,7 @@ def get_cmd_line(args): (not args.pass_only), (not args.perform_outlier_analysis), (args.safety_mode), + (args.deactivate_sensitive_mode), ) ) diff --git a/rnaindel/version.py b/rnaindel/version.py index c90ab1b..1549c12 100755 --- a/rnaindel/version.py +++ b/rnaindel/version.py @@ -1 +1 @@ -__version__ = "3.3.5" +__version__ = "3.3.6"