From 296ad367e6c661bbfa57f3547a8e8f0e66e66218 Mon Sep 17 00:00:00 2001 From: Xiaoyu Chen Date: Wed, 7 Nov 2018 14:25:17 -0800 Subject: [PATCH] MANTA-1398 Address review comments --- CHANGELOG.md | 3 ++- .../SVCandidateAssemblyRefiner.cpp | 4 ++-- .../SVScorePairAltProcessor.cpp | 4 ++-- src/c++/lib/blt_util/align_path.cpp | 4 ++-- src/c++/lib/blt_util/align_path.hh | 4 ++-- src/c++/lib/blt_util/test/align_path_test.cpp | 2 +- src/c++/lib/htsapi/bam_record_util.cpp | 14 +++++++------- src/c++/lib/htsapi/bam_record_util.hh | 8 -------- src/c++/lib/manta/SVCandidateAssembler.cpp | 2 +- src/c++/lib/manta/SVLocusScanner.cpp | 4 ++-- src/c++/lib/manta/SVLocusScannerSemiAligned.cpp | 4 ++-- src/c++/lib/manta/SVLocusScannerSemiAligned.hh | 10 +++++----- src/c++/lib/options/ReadScannerOptions.hh | 2 +- src/c++/lib/options/ReadScannerOptionsParser.cpp | 2 +- src/c++/lib/test/testSVLocusScanner.cpp | 2 +- src/python/bin/configManta.py.ini | 2 +- src/python/lib/mantaWorkflow.py | 6 +++--- 17 files changed, 35 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d01333e7..53fa35a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ ## Unreleased ### Added -- Add a configurable option to allow overlapping reads considered as evidence (MANTA-1398) +- Add a configurable option to allow overlapping pairs to be used as evidence (MANTA-1398) + - The option is available in the configure file configureManta.py.ini ## v1.4.0 - 2018-04-25 diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp index cd98a385..18461d22 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp @@ -130,7 +130,7 @@ isLowQualitySpanningSVAlignment( apath_limit_ref_length(maxQCRefSpan,apath); const unsigned readSize(apath_read_length(apath)); - const unsigned clipSize(apath_soft_clip_trail_size(apath)); + const unsigned clipSize(apath_soft_clip_right_size(apath)); assert(clipSize <= readSize); @@ -393,7 +393,7 @@ isLowQualitySmallSVAlignment( } const unsigned pathSize(apath_read_length(apath)); - const unsigned clipSize(apath_soft_clip_trail_size(apath)); + const unsigned clipSize(apath_soft_clip_right_size(apath)); assert(clipSize <= pathSize); diff --git a/src/c++/lib/applications/GenerateSVCandidates/SVScorePairAltProcessor.cpp b/src/c++/lib/applications/GenerateSVCandidates/SVScorePairAltProcessor.cpp index eaccb300..1b00d2bd 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/SVScorePairAltProcessor.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/SVScorePairAltProcessor.cpp @@ -265,11 +265,11 @@ realignPairedRead( { if (isLeftOfInsert) { - clipSize=apath_soft_clip_trail_size(readPath); + clipSize=apath_soft_clip_right_size(readPath); } else { - clipSize=apath_soft_clip_lead_size(readPath); + clipSize=apath_soft_clip_left_size(readPath); } } diff --git a/src/c++/lib/blt_util/align_path.cpp b/src/c++/lib/blt_util/align_path.cpp index a5bda973..b0b09857 100644 --- a/src/c++/lib/blt_util/align_path.cpp +++ b/src/c++/lib/blt_util/align_path.cpp @@ -217,7 +217,7 @@ unalignedSuffixSize(const path_t& apath) unsigned -apath_soft_clip_lead_size(const path_t& apath) +apath_soft_clip_left_size(const path_t& apath) { unsigned val(0); for (const path_segment& ps : apath) @@ -241,7 +241,7 @@ apath_soft_clip_lead_size(const path_t& apath) unsigned -apath_soft_clip_trail_size(const path_t& apath) +apath_soft_clip_right_size(const path_t& apath) { unsigned val(0); BOOST_REVERSE_FOREACH(const path_segment& ps, apath) diff --git a/src/c++/lib/blt_util/align_path.hh b/src/c++/lib/blt_util/align_path.hh index 736bbacc..9fc143ab 100644 --- a/src/c++/lib/blt_util/align_path.hh +++ b/src/c++/lib/blt_util/align_path.hh @@ -249,11 +249,11 @@ unalignedSuffixSize(const path_t& apath); /// how much soft_clip occurs before the first aligned base? unsigned -apath_soft_clip_lead_size(const path_t& apath); +apath_soft_clip_left_size(const path_t& apath); /// how much soft_clip occurs after the last aligned base? unsigned -apath_soft_clip_trail_size(const path_t& apath); +apath_soft_clip_right_size(const path_t& apath); /// how much clip (soft or hard) occurs before the first aligned base? unsigned diff --git a/src/c++/lib/blt_util/test/align_path_test.cpp b/src/c++/lib/blt_util/test/align_path_test.cpp index 16da16cd..33bcfc43 100644 --- a/src/c++/lib/blt_util/test/align_path_test.cpp +++ b/src/c++/lib/blt_util/test/align_path_test.cpp @@ -219,7 +219,7 @@ BOOST_AUTO_TEST_CASE( test_apath_clip_trail ) ALIGNPATH::path_t path; cigar_to_apath(testCigar.c_str(), path); - BOOST_REQUIRE_EQUAL(apath_soft_clip_trail_size(path),3u); + BOOST_REQUIRE_EQUAL(apath_soft_clip_right_size(path),3u); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/c++/lib/htsapi/bam_record_util.cpp b/src/c++/lib/htsapi/bam_record_util.cpp index 74b964be..0b1ad791 100644 --- a/src/c++/lib/htsapi/bam_record_util.cpp +++ b/src/c++/lib/htsapi/bam_record_util.cpp @@ -84,14 +84,14 @@ is_adapter_pair( const SimpleAlignment mate(getKnownOrFakedMateAlignment(bamRead)); if (aln.is_fwd_strand) { - unsigned const endpos = aln.pos + apath_ref_length(aln.path) + apath_soft_clip_trail_size(aln.path); - unsigned const mateStartPos = mate.pos + apath_ref_length(mate.path) + apath_soft_clip_trail_size(mate.path); + unsigned const endpos = aln.pos + apath_ref_length(aln.path) + apath_soft_clip_right_size(aln.path); + unsigned const mateStartPos = mate.pos + apath_ref_length(mate.path) + apath_soft_clip_right_size(mate.path); return (endpos > mateStartPos); } else { - unsigned const endpos = aln.pos - apath_soft_clip_lead_size(aln.path); - unsigned const mateStartPos = mate.pos - apath_soft_clip_lead_size(mate.path); + unsigned const endpos = aln.pos - apath_soft_clip_left_size(aln.path); + unsigned const mateStartPos = mate.pos - apath_soft_clip_left_size(mate.path); return (endpos < mateStartPos); } } @@ -100,8 +100,8 @@ is_adapter_pair( // If we do not have mate cigar information use an aggressive heuristic: // if the read contains soft clip on the 3' end, it likely runs into adapter. unsigned const softClipSize(aln.is_fwd_strand ? - apath_soft_clip_trail_size(aln.path) : - apath_soft_clip_lead_size(aln.path)); + apath_soft_clip_right_size(aln.path) : + apath_soft_clip_left_size(aln.path)); return (softClipSize > 0); } } @@ -116,7 +116,7 @@ is_overlapping_pair( if (! is_mapped_chrom_pair(bamRead)) return false; if (bamRead.is_fwd_strand() == bamRead.is_mate_fwd_strand()) return false; - static const int reverseOrientDist(bamRead.read_size()); + const int reverseOrientDist(bamRead.read_size()); int posDiff(bamRead.pos() - bamRead.mate_pos()); if (! bamRead.is_fwd_strand()) { diff --git a/src/c++/lib/htsapi/bam_record_util.hh b/src/c++/lib/htsapi/bam_record_util.hh index 18272ecf..8fda64e6 100644 --- a/src/c++/lib/htsapi/bam_record_util.hh +++ b/src/c++/lib/htsapi/bam_record_util.hh @@ -53,14 +53,6 @@ bool is_innie_pair( const bam_record& bam_read); -/// Rough heuristic to detect cases where paired-end reads overlap in such a way as to suggest -/// a possible unfiltered read into adapter sequence (assuming innie pairs). -/// Used as an aggressive check for candidate/hypothesis generation to avoid large number of -/// spurious small indels for short insert data that are derived from adapter k-mers -bool -is_possible_adapter_pair( - const bam_record& bamRead); - /// Based on pair alignment information (MC tag) if available, detect cases where a read extends /// past the beginning of its mate, suggesting the presence of unfiltered adapter sequence /// If MC information is not available, use a more aggressive heuristic (any 3' softclipping) diff --git a/src/c++/lib/manta/SVCandidateAssembler.cpp b/src/c++/lib/manta/SVCandidateAssembler.cpp index 4f56c0a4..5eab27b8 100644 --- a/src/c++/lib/manta/SVCandidateAssembler.cpp +++ b/src/c++/lib/manta/SVCandidateAssembler.cpp @@ -560,7 +560,7 @@ getBreakendReads( unsigned leadingMismatchLen(0); unsigned trailingMismatchLen(0); getSVBreakendCandidateSemiAlignedSimple(bamRead, bamAlign, refSeq, - _scanOpt.isUseOverlappingPair, + _scanOpt.useOverlapPairEvidence, leadingMismatchLen, trailingMismatchLen); if (isSearchForRightOpen) diff --git a/src/c++/lib/manta/SVLocusScanner.cpp b/src/c++/lib/manta/SVLocusScanner.cpp index 3459c58d..9fcb2e49 100644 --- a/src/c++/lib/manta/SVLocusScanner.cpp +++ b/src/c++/lib/manta/SVLocusScanner.cpp @@ -541,7 +541,7 @@ getSVCandidatesFromSemiAligned( unsigned trailingMismatchLen(0); pos_t leadingRefPos(0), trailingRefPos(0); getSVBreakendCandidateSemiAligned(bamRead, bamAlign, refSeq, - opt.isUseOverlappingPair, + opt.useOverlapPairEvidence, leadingMismatchLen, leadingRefPos, trailingMismatchLen, trailingRefPos); @@ -1609,7 +1609,7 @@ isSemiAlignedEvidence( { unsigned leadingMismatchLen(0), trailingMismatchLen(0); getSVBreakendCandidateSemiAlignedSimple(bamRead, bamAlign, refSeq, - _opt.isUseOverlappingPair, + _opt.useOverlapPairEvidence, leadingMismatchLen, trailingMismatchLen); return ((leadingMismatchLen >= _opt.minSemiAlignedMismatchLen) || (trailingMismatchLen >= _opt.minSemiAlignedMismatchLen)); } diff --git a/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp b/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp index c1ad750c..f2cce548 100644 --- a/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp +++ b/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp @@ -226,7 +226,7 @@ getSVBreakendCandidateSemiAligned( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPair, + const bool useOverlapPairEvidence, unsigned& leadingEdgePoorAlignmentLength, pos_t& leadingEdgeRefPos, unsigned& trailingEdgePoorAlignmentLength, @@ -244,7 +244,7 @@ getSVBreakendCandidateSemiAligned( const bool isOverlappingReadPair(is_overlapping_pair(bamRead, bamAlign)); if (isOverlappingReadPair) { - if ((! isUseOverlappingPair) || (is_adapter_pair(bamRead))) return; + if ((! useOverlapPairEvidence) || (is_adapter_pair(bamRead))) return; } using namespace ALIGNPATH; diff --git a/src/c++/lib/manta/SVLocusScannerSemiAligned.hh b/src/c++/lib/manta/SVLocusScannerSemiAligned.hh index 882d53be..879422eb 100644 --- a/src/c++/lib/manta/SVLocusScannerSemiAligned.hh +++ b/src/c++/lib/manta/SVLocusScannerSemiAligned.hh @@ -41,7 +41,7 @@ /// /// To do so the following steps are taken: /// 1. Reads where there is suspected adaptor 'read-through' due to short fragments size are filtered out. -/// 2. Reads which overlap their mate read are optionally filtered out (per function argument \p isUseOverlappingPair). +/// 2. Reads which overlap their mate read are optionally filtered out (per function argument \p useOverlapPairEvidence). /// 3. Any soft-clipped segments in the input reed are 'unrolled' to the match state. /// 4. The length of poorly aligned sequence on each read edge is found. /// 5. Cases where the entire read is poorly aligned are filtered out. @@ -69,7 +69,7 @@ /// \param[in] bamAlign A simplified representation of the bamRead's alignment. This information can be derived from /// bamRead but is provided as a (presumed) cache optimization. /// \param[in] refSeq Local reference sequence -/// \param[in] isUseOverlappingPair When false, filter out read pairs which overlap. +/// \param[in] useOverlapPairEvidence When false, filter out read pairs which overlap. /// \param[in] isAggressiveAdaptorCheck When true, filter out read pairs that might run into adapters based on an aggressive check /// \param[out] leadingEdgePoorAlignmentLength Length of possible breakend-associated poor alignment on the read's leading edge. /// \param[out] leadingEdgeRefPos The reference position of the first aligned base after the leading poorly aligned segment. @@ -85,7 +85,7 @@ getSVBreakendCandidateSemiAligned( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPair, + const bool useOverlapPairEvidence, unsigned& leadingEdgePoorAlignmentLength, pos_t& leadingEdgeRefPos, unsigned& trailingEdgePoorAlignmentLength, @@ -104,14 +104,14 @@ getSVBreakendCandidateSemiAlignedSimple( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPair, + const bool useOverlapPairEvidence, unsigned& leadingMismatchLen, unsigned& trailingMismatchLen) { pos_t leadingRefPos(0), trailingRefPos(0); getSVBreakendCandidateSemiAligned( bamRead, bamAlign, refSeq, - isUseOverlappingPair, + useOverlapPairEvidence, leadingMismatchLen, leadingRefPos, trailingMismatchLen, trailingRefPos); } diff --git a/src/c++/lib/options/ReadScannerOptions.hh b/src/c++/lib/options/ReadScannerOptions.hh index cad2a862..c1fff31a 100644 --- a/src/c++/lib/options/ReadScannerOptions.hh +++ b/src/c++/lib/options/ReadScannerOptions.hh @@ -92,7 +92,7 @@ struct ReadScannerOptions unsigned minSingletonMapqCandidates = 15; // \brief If true, consider an overlapping read pair as evidence. - bool isUseOverlappingPair = false; + bool useOverlapPairEvidence = false; /// \brief If true, do not treat reads with the 'proper pair' bit set as SV evidence. /// diff --git a/src/c++/lib/options/ReadScannerOptionsParser.cpp b/src/c++/lib/options/ReadScannerOptionsParser.cpp index 5c13aa0b..7e5434d0 100644 --- a/src/c++/lib/options/ReadScannerOptionsParser.cpp +++ b/src/c++/lib/options/ReadScannerOptionsParser.cpp @@ -32,7 +32,7 @@ getOptionsDescription(ReadScannerOptions& opt) "Reads with MAPQ less than this value will be ignored") ("edge-prob", po::value(&opt.breakendEdgeQuantileProb)->default_value(opt.breakendEdgeQuantileProb), "Breakend range associated with each read will trimmed to expected fragment quantile range [p,(1-p)], p: edge-prob") - ("use-overlapping-pair", po::value(&opt.isUseOverlappingPair)->zero_tokens(), + ("use-overlapping-pair", po::value(&opt.useOverlapPairEvidence)->zero_tokens(), "Consider an overlapping read pair as evidence") ("ignore-anom-proper-pair", po::value(&opt.isIgnoreAnomProperPair)->zero_tokens(), "Disregard anomalous fragment sizes if the BAM record has the proper pair bit set. " diff --git a/src/c++/lib/test/testSVLocusScanner.cpp b/src/c++/lib/test/testSVLocusScanner.cpp index af32d793..7f050ac4 100644 --- a/src/c++/lib/test/testSVLocusScanner.cpp +++ b/src/c++/lib/test/testSVLocusScanner.cpp @@ -38,7 +38,7 @@ buildTestSVLocusScanner( { ReadScannerOptions opts = ReadScannerOptions(); opts.minCandidateVariantSize = minCandidateVariantSizeInput; - opts.isUseOverlappingPair = isRNA; + opts.useOverlapPairEvidence = isRNA; const ReadScannerOptions& constRefOpts(opts); TestStatsFileMaker statsFile; diff --git a/src/python/bin/configManta.py.ini b/src/python/bin/configManta.py.ini index a21ceb27..8e5a03b4 100644 --- a/src/python/bin/configManta.py.ini +++ b/src/python/bin/configManta.py.ini @@ -55,4 +55,4 @@ enableRemoteReadRetrievalForInsertionsInCancerCallingModes = 0 # Set if an overlapping read pair will be considered as evidence # Set to 0 to skip overlapping read pairs -isUseOverlappingPair = 0 +useOverlapPairEvidence = 0 diff --git a/src/python/lib/mantaWorkflow.py b/src/python/lib/mantaWorkflow.py index a4cfc9d3..e0dc9972 100644 --- a/src/python/lib/mantaWorkflow.py +++ b/src/python/lib/mantaWorkflow.py @@ -581,7 +581,7 @@ def isEnableRemoteReadRetrieval() : if self.params.isIgnoreAnomProperPair : hygenCmd.append("--ignore-anom-proper-pair") - if self.params.isUseOverlappingPair: + if self.params.useOverlapPairEvidence: hygenCmd.append("--use-overlapping-pair") if self.params.isRNA : @@ -807,7 +807,7 @@ def __init__(self,params) : # normalize boolean option input: safeSetBool(self.params,"enableRemoteReadRetrievalForInsertionsInGermlineCallingModes") safeSetBool(self.params,"enableRemoteReadRetrievalForInsertionsInCancerCallingModes") - safeSetBool(self.params,"isUseOverlappingPair") + safeSetBool(self.params,"useOverlapPairEvidence") # Use RNA option for minCandidate size if self.params.isRNA: @@ -857,7 +857,7 @@ def __init__(self,params) : # always use overlapping pairs for RNA calling if (self.params.isRNA) : - self.params.isUseOverlappingPair = True + self.params.useOverlapPairEvidence = True def setCallMemMb(self) :