diff --git a/CHANGELOG.md b/CHANGELOG.md index 45a20385..d01333e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## Unreleased + +### Added +- Add a configurable option to allow overlapping reads considered as evidence (MANTA-1398) + ## v1.4.0 - 2018-04-25 This is a major bugfix update from v1.3.2, featuring improved precision and vcf representation, in addition to minor user friendly improvements. diff --git a/src/c++/lib/applications/GenerateSVCandidates/GenerateSVCandidates.cpp b/src/c++/lib/applications/GenerateSVCandidates/GenerateSVCandidates.cpp index e60a9ebc..d2f9d707 100644 --- a/src/c++/lib/applications/GenerateSVCandidates/GenerateSVCandidates.cpp +++ b/src/c++/lib/applications/GenerateSVCandidates/GenerateSVCandidates.cpp @@ -188,7 +188,7 @@ runGSC( EdgeRuntimeTracker edgeTracker(opt.edgeRuntimeFilename); GSCEdgeStatsManager edgeStatMan(opt.edgeStatsFilename); - const SVLocusScanner readScanner(opt.scanOpt, opt.statsFilename, opt.alignFileOpt.alignmentFilenames, opt.isRNA, !opt.isUnstrandedRNA); + const SVLocusScanner readScanner(opt.scanOpt, opt.statsFilename, opt.alignFileOpt.alignmentFilenames, !opt.isUnstrandedRNA); SVFinder svFind(opt, readScanner, edgeTracker,edgeStatMan); MultiJunctionFilter svMJFilter(opt,edgeStatMan); diff --git a/src/c++/lib/htsapi/bam_record_util.cpp b/src/c++/lib/htsapi/bam_record_util.cpp index 21f2b94d..74b964be 100644 --- a/src/c++/lib/htsapi/bam_record_util.cpp +++ b/src/c++/lib/htsapi/bam_record_util.cpp @@ -85,12 +85,12 @@ is_adapter_pair( if (aln.is_fwd_strand) { unsigned const endpos = aln.pos + apath_ref_length(aln.path) + apath_soft_clip_trail_size(aln.path); - unsigned const mateStartPos = mate.pos + apath_ref_length(mate.path) + apath_soft_clip_lead_size(mate.path); + unsigned const mateStartPos = mate.pos + apath_ref_length(mate.path) + apath_soft_clip_trail_size(mate.path); return (endpos > mateStartPos); } else { - unsigned const endpos = aln.pos - apath_soft_clip_trail_size(aln.path); + unsigned const endpos = aln.pos - apath_soft_clip_lead_size(aln.path); unsigned const mateStartPos = mate.pos - apath_soft_clip_lead_size(mate.path); return (endpos < mateStartPos); } diff --git a/src/c++/lib/manta/SVCandidateAssembler.cpp b/src/c++/lib/manta/SVCandidateAssembler.cpp index 82ff5a7b..4f56c0a4 100644 --- a/src/c++/lib/manta/SVCandidateAssembler.cpp +++ b/src/c++/lib/manta/SVCandidateAssembler.cpp @@ -560,7 +560,7 @@ getBreakendReads( unsigned leadingMismatchLen(0); unsigned trailingMismatchLen(0); getSVBreakendCandidateSemiAlignedSimple(bamRead, bamAlign, refSeq, - _readScanner.isUseOverlappingPairs(), + _scanOpt.isUseOverlappingPair, leadingMismatchLen, trailingMismatchLen); if (isSearchForRightOpen) diff --git a/src/c++/lib/manta/SVLocusScanner.cpp b/src/c++/lib/manta/SVLocusScanner.cpp index 9fe64a9e..3459c58d 100644 --- a/src/c++/lib/manta/SVLocusScanner.cpp +++ b/src/c++/lib/manta/SVLocusScanner.cpp @@ -541,7 +541,7 @@ getSVCandidatesFromSemiAligned( unsigned trailingMismatchLen(0); pos_t leadingRefPos(0), trailingRefPos(0); getSVBreakendCandidateSemiAligned(bamRead, bamAlign, refSeq, - dopt.isUseOverlappingPairs, + opt.isUseOverlappingPair, leadingMismatchLen, leadingRefPos, trailingMismatchLen, trailingRefPos); @@ -1429,10 +1429,9 @@ SVLocusScanner( const ReadScannerOptions& opt, const std::string& statsFilename, const std::vector& /*alignmentFilename*/, - const bool isRNA, const bool isTranscriptStrandKnown) : _opt(opt), - _dopt(opt, isRNA, isTranscriptStrandKnown) + _dopt(opt, isTranscriptStrandKnown) { using namespace illumina::common; @@ -1610,7 +1609,7 @@ isSemiAlignedEvidence( { unsigned leadingMismatchLen(0), trailingMismatchLen(0); getSVBreakendCandidateSemiAlignedSimple(bamRead, bamAlign, refSeq, - _dopt.isUseOverlappingPairs, + _opt.isUseOverlappingPair, leadingMismatchLen, trailingMismatchLen); return ((leadingMismatchLen >= _opt.minSemiAlignedMismatchLen) || (trailingMismatchLen >= _opt.minSemiAlignedMismatchLen)); } diff --git a/src/c++/lib/manta/SVLocusScanner.hh b/src/c++/lib/manta/SVLocusScanner.hh index 2556b1c7..28fda1ea 100644 --- a/src/c++/lib/manta/SVLocusScanner.hh +++ b/src/c++/lib/manta/SVLocusScanner.hh @@ -83,12 +83,10 @@ struct ReadScannerDerivOptions { ReadScannerDerivOptions( const ReadScannerOptions& opt, - const bool isRNA, const bool initIsTranscriptStrandKnown) : isSmallCandidates(opt.minCandidateVariantSize<=opt.maxCandidateSizeForLocalAssmEvidence), beforeBreakend(opt.minPairBreakendSize/2), afterBreakend(opt.minPairBreakendSize-beforeBreakend), - isUseOverlappingPairs(isRNA), isTranscriptStrandKnown(initIsTranscriptStrandKnown) {} @@ -96,10 +94,6 @@ struct ReadScannerDerivOptions const pos_t beforeBreakend; const pos_t afterBreakend; - /// TODO standardize the overlapping pair treatment to be the same for DNA/RNA modes, then - /// remove this bit: - const bool isUseOverlappingPairs; - /// \brief True if running in RNA-Seq mode with a stranded input RNA assay const bool isTranscriptStrandKnown; }; @@ -119,7 +113,6 @@ struct SVLocusScanner const ReadScannerOptions& opt, const std::string& statsFilename, const std::vector& alignmentFilename, - const bool isRNA, const bool isTranscriptStrandKnown = false); /// QC check if the read length implied by cigar string matches the length of read sequence @@ -317,12 +310,6 @@ struct SVLocusScanner LinearScaler largeEventRegionScaler; }; - bool - isUseOverlappingPairs() const - { - return _dopt.isUseOverlappingPairs; - } - private: /// \brief Classify read pair fragment sizes into a pre-defined size category. diff --git a/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp b/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp index 3baf9e02..c1ad750c 100644 --- a/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp +++ b/src/c++/lib/manta/SVLocusScannerSemiAligned.cpp @@ -226,7 +226,7 @@ getSVBreakendCandidateSemiAligned( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPairs, + const bool isUseOverlappingPair, unsigned& leadingEdgePoorAlignmentLength, pos_t& leadingEdgeRefPos, unsigned& trailingEdgePoorAlignmentLength, @@ -244,7 +244,7 @@ getSVBreakendCandidateSemiAligned( const bool isOverlappingReadPair(is_overlapping_pair(bamRead, bamAlign)); if (isOverlappingReadPair) { - if ((! isUseOverlappingPairs) || (is_adapter_pair(bamRead))) return; + if ((! isUseOverlappingPair) || (is_adapter_pair(bamRead))) return; } using namespace ALIGNPATH; diff --git a/src/c++/lib/manta/SVLocusScannerSemiAligned.hh b/src/c++/lib/manta/SVLocusScannerSemiAligned.hh index cd86ca93..882d53be 100644 --- a/src/c++/lib/manta/SVLocusScannerSemiAligned.hh +++ b/src/c++/lib/manta/SVLocusScannerSemiAligned.hh @@ -41,7 +41,7 @@ /// /// To do so the following steps are taken: /// 1. Reads where there is suspected adaptor 'read-through' due to short fragments size are filtered out. -/// 2. Reads which overlap their mate read are optionally filtered out (per function argument \p isUseOverlappingPairs). +/// 2. Reads which overlap their mate read are optionally filtered out (per function argument \p isUseOverlappingPair). /// 3. Any soft-clipped segments in the input reed are 'unrolled' to the match state. /// 4. The length of poorly aligned sequence on each read edge is found. /// 5. Cases where the entire read is poorly aligned are filtered out. @@ -69,7 +69,7 @@ /// \param[in] bamAlign A simplified representation of the bamRead's alignment. This information can be derived from /// bamRead but is provided as a (presumed) cache optimization. /// \param[in] refSeq Local reference sequence -/// \param[in] isUseOverlappingPairs When false, filter out read pairs which overlap. +/// \param[in] isUseOverlappingPair When false, filter out read pairs which overlap. /// \param[in] isAggressiveAdaptorCheck When true, filter out read pairs that might run into adapters based on an aggressive check /// \param[out] leadingEdgePoorAlignmentLength Length of possible breakend-associated poor alignment on the read's leading edge. /// \param[out] leadingEdgeRefPos The reference position of the first aligned base after the leading poorly aligned segment. @@ -85,7 +85,7 @@ getSVBreakendCandidateSemiAligned( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPairs, + const bool isUseOverlappingPair, unsigned& leadingEdgePoorAlignmentLength, pos_t& leadingEdgeRefPos, unsigned& trailingEdgePoorAlignmentLength, @@ -104,14 +104,14 @@ getSVBreakendCandidateSemiAlignedSimple( const bam_record& bamRead, const SimpleAlignment& bamAlign, const reference_contig_segment& refSeq, - const bool isUseOverlappingPairs, + const bool isUseOverlappingPair, unsigned& leadingMismatchLen, unsigned& trailingMismatchLen) { pos_t leadingRefPos(0), trailingRefPos(0); getSVBreakendCandidateSemiAligned( bamRead, bamAlign, refSeq, - isUseOverlappingPairs, + isUseOverlappingPair, leadingMismatchLen, leadingRefPos, trailingMismatchLen, trailingRefPos); } diff --git a/src/c++/lib/manta/test/SVLocusScannerTest.cpp b/src/c++/lib/manta/test/SVLocusScannerTest.cpp index f46fd703..7469729e 100644 --- a/src/c++/lib/manta/test/SVLocusScannerTest.cpp +++ b/src/c++/lib/manta/test/SVLocusScannerTest.cpp @@ -334,10 +334,9 @@ BOOST_AUTO_TEST_CASE( test_SemiAlignedReads ) BOOST_AUTO_TEST_CASE( test_getSVCandidatesFromReadIndels ) { - const bool isRNA(false); const bool isStranded(true); const ReadScannerOptions opt; - const ReadScannerDerivOptions dopt(opt,isRNA,isStranded); + const ReadScannerDerivOptions dopt(opt,isStranded); ALIGNPATH::path_t inputPath; cigar_to_apath("100M2000D100M",inputPath); diff --git a/src/c++/lib/options/ReadScannerOptions.hh b/src/c++/lib/options/ReadScannerOptions.hh index daec988a..cad2a862 100644 --- a/src/c++/lib/options/ReadScannerOptions.hh +++ b/src/c++/lib/options/ReadScannerOptions.hh @@ -91,6 +91,9 @@ struct ReadScannerOptions /// \brief Minimum mapping quality for shadow mate used for candidate assembly and scoring unsigned minSingletonMapqCandidates = 15; + // \brief If true, consider an overlapping read pair as evidence. + bool isUseOverlappingPair = false; + /// \brief If true, do not treat reads with the 'proper pair' bit set as SV evidence. /// /// This is typically set true for RNA-Seq analysis, where proper-pair is used to signal intron-spanning pairs. diff --git a/src/c++/lib/options/ReadScannerOptionsParser.cpp b/src/c++/lib/options/ReadScannerOptionsParser.cpp index 3284428f..5c13aa0b 100644 --- a/src/c++/lib/options/ReadScannerOptionsParser.cpp +++ b/src/c++/lib/options/ReadScannerOptionsParser.cpp @@ -32,6 +32,8 @@ getOptionsDescription(ReadScannerOptions& opt) "Reads with MAPQ less than this value will be ignored") ("edge-prob", po::value(&opt.breakendEdgeQuantileProb)->default_value(opt.breakendEdgeQuantileProb), "Breakend range associated with each read will trimmed to expected fragment quantile range [p,(1-p)], p: edge-prob") + ("use-overlapping-pair", po::value(&opt.isUseOverlappingPair)->zero_tokens(), + "Consider an overlapping read pair as evidence") ("ignore-anom-proper-pair", po::value(&opt.isIgnoreAnomProperPair)->zero_tokens(), "Disregard anomalous fragment sizes if the BAM record has the proper pair bit set. " "This flag is typically set for RNA-SEQ analysis where the proper-pair bit is used to indicate an intron-spanning read pair.") diff --git a/src/c++/lib/test/testSVLocusScanner.cpp b/src/c++/lib/test/testSVLocusScanner.cpp index bca2d0c8..af32d793 100644 --- a/src/c++/lib/test/testSVLocusScanner.cpp +++ b/src/c++/lib/test/testSVLocusScanner.cpp @@ -38,6 +38,7 @@ buildTestSVLocusScanner( { ReadScannerOptions opts = ReadScannerOptions(); opts.minCandidateVariantSize = minCandidateVariantSizeInput; + opts.isUseOverlappingPair = isRNA; const ReadScannerOptions& constRefOpts(opts); TestStatsFileMaker statsFile; diff --git a/src/python/bin/configManta.py.ini b/src/python/bin/configManta.py.ini index e9da525b..a21ceb27 100644 --- a/src/python/bin/configManta.py.ini +++ b/src/python/bin/configManta.py.ini @@ -52,3 +52,7 @@ minPassSomaticScore = 30 # all other calling modes. enableRemoteReadRetrievalForInsertionsInGermlineCallingModes = 1 enableRemoteReadRetrievalForInsertionsInCancerCallingModes = 0 + +# Set if an overlapping read pair will be considered as evidence +# Set to 0 to skip overlapping read pairs +isUseOverlappingPair = 0 diff --git a/src/python/lib/mantaWorkflow.py b/src/python/lib/mantaWorkflow.py index 69e7604a..a4cfc9d3 100644 --- a/src/python/lib/mantaWorkflow.py +++ b/src/python/lib/mantaWorkflow.py @@ -580,6 +580,10 @@ def isEnableRemoteReadRetrieval() : if self.params.isIgnoreAnomProperPair : hygenCmd.append("--ignore-anom-proper-pair") + + if self.params.isUseOverlappingPair: + hygenCmd.append("--use-overlapping-pair") + if self.params.isRNA : hygenCmd.append("--rna") if self.params.isUnstrandedRNA : @@ -803,6 +807,7 @@ def __init__(self,params) : # normalize boolean option input: safeSetBool(self.params,"enableRemoteReadRetrievalForInsertionsInGermlineCallingModes") safeSetBool(self.params,"enableRemoteReadRetrievalForInsertionsInCancerCallingModes") + safeSetBool(self.params,"isUseOverlappingPair") # Use RNA option for minCandidate size if self.params.isRNA: @@ -850,6 +855,10 @@ def __init__(self,params) : self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA)) self.params.isIgnoreAnomProperPair = (self.params.isRNA) + # always use overlapping pairs for RNA calling + if (self.params.isRNA) : + self.params.isUseOverlappingPair = True + def setCallMemMb(self) : "Setup default task memory requirements"