From 2ac0b3c3853f0593803ae12124818473329c69b7 Mon Sep 17 00:00:00 2001 From: Xiaoyu Chen Date: Fri, 8 Apr 2016 17:02:43 -0700 Subject: [PATCH] [MANTA-287] add the unit test of consistency of supporting reads --- src/c++/lib/assembly/SmallAssembler.cpp | 89 +++++++++---------- .../lib/assembly/test/SmallAssemblerTest.cpp | 52 +++++++++++ src/c++/lib/manta/SVCandidateAssembler.cpp | 2 +- src/c++/lib/svgraph/SVLocusNode.hh | 2 +- 4 files changed, 98 insertions(+), 47 deletions(-) diff --git a/src/c++/lib/assembly/SmallAssembler.cpp b/src/c++/lib/assembly/SmallAssembler.cpp index d3a5012a..00ed10a8 100644 --- a/src/c++/lib/assembly/SmallAssembler.cpp +++ b/src/c++/lib/assembly/SmallAssembler.cpp @@ -35,7 +35,7 @@ // compile with this macro to get verbose output: -#define DEBUG_ASBL +//#define DEBUG_ASBL // stream used by DEBUG_ASBL: @@ -183,32 +183,32 @@ walk(const SmallAssemblerOptions& opt, contig.seq = seed; // collecting rejecting reads for the seed from the unselected branches - for (const char symbol : opt.alphabet) - { - // the seed itself - if (symbol == seed[wordLength-1]) continue; + for (const char symbol : opt.alphabet) + { + // the seed itself + if (symbol == seed[wordLength-1]) continue; - // add rejecting reads from an unselected word/branch - const std::string tmpBack = getEnd(seed, wordLength-1, false); - const std::string newKey(addBase(tmpBack, symbol, true)); + // add rejecting reads from an unselected word/branch + const std::string tmpBack = getEnd(seed, wordLength-1, false); + const std::string newKey(addBase(tmpBack, symbol, true)); #ifdef DEBUG_ASBL - log_os << "Extending end backwords: base " << symbol << " " << newKey << "\n"; + log_os << "Extending end backwords: base " << symbol << " " << newKey << "\n"; #endif - wordReadsIter= wordReads.find(newKey); - if (wordReadsIter == wordReadsEnd) continue; - const std::set& unselectedReads(wordReadsIter->second); + wordReadsIter= wordReads.find(newKey); + if (wordReadsIter == wordReadsEnd) continue; + const std::set& unselectedReads(wordReadsIter->second); #ifdef DEBUG_ASBL - log_os << "Supporting reads for the backwards word : "; - print_readSet(unselectedReads); + log_os << "Supporting reads for the backwards word : "; + print_readSet(unselectedReads); #endif - contig.rejectReads.insert(unselectedReads.begin(), unselectedReads.end()); + contig.rejectReads.insert(unselectedReads.begin(), unselectedReads.end()); #ifdef DEBUG_ASBL - log_os << "seed's rejecting reads : "; - print_readSet(contig.rejectReads); + log_os << "seed's rejecting reads : "; + print_readSet(contig.rejectReads); #endif - } + } seenEdgeBefore.clear(); seenEdgeBefore.insert(seed); @@ -224,8 +224,8 @@ walk(const SmallAssemblerOptions& opt, while (true) { - const std::string previousWord = getEnd(contig.seq, wordLength, isEnd); - const std::string trunk(getEnd(contig.seq, wordLength-1, isEnd)); + const std::string previousWord = getEnd(contig.seq, wordLength, isEnd); + const std::string trunk(getEnd(contig.seq, wordLength-1, isEnd)); #ifdef DEBUG_ASBL log_os << "# current contig : " << contig.seq << " size : " << contig.seq.size() << "\n" @@ -345,35 +345,35 @@ walk(const SmallAssemblerOptions& opt, // TODO: can add threshold for the count or percentage of shared reads { - // walk backwards for one step at a branching point - if (maxWordReads != previousWordReads) - { - const char tmpSymbol = (isEnd? previousWord[0] : previousWord[wordLength-1]); - for (const char symbol : opt.alphabet) - { - // the selected branch - if (symbol == tmpSymbol) continue; + // walk backwards for one step at a branching point + if (maxWordReads != previousWordReads) + { + const char tmpSymbol = (isEnd? previousWord[0] : previousWord[wordLength-1]); + for (const char symbol : opt.alphabet) + { + // the selected branch + if (symbol == tmpSymbol) continue; - // add rejecting reads from an unselected branch - const std::string newKey(addBase(trunk, symbol, !isEnd)); + // add rejecting reads from an unselected branch + const std::string newKey(addBase(trunk, symbol, !isEnd)); #ifdef DEBUG_ASBL - log_os << "Extending end backwords: base " << symbol << " " << newKey << "\n"; + log_os << "Extending end backwords: base " << symbol << " " << newKey << "\n"; #endif - wordReadsIter= wordReads.find(newKey); - if (wordReadsIter == wordReadsEnd) continue; - const std::set& backWordReads(wordReadsIter->second); + wordReadsIter= wordReads.find(newKey); + if (wordReadsIter == wordReadsEnd) continue; + const std::set& backWordReads(wordReadsIter->second); #ifdef DEBUG_ASBL - log_os << "Supporting reads for the backwards word : "; - print_readSet(backWordReads); + log_os << "Supporting reads for the backwards word : "; + print_readSet(backWordReads); #endif - rejectReads2Add.insert(backWordReads.begin(), backWordReads.end()); + rejectReads2Add.insert(backWordReads.begin(), backWordReads.end()); #ifdef DEBUG_ASBL - log_os << "rejectReads2Add upated : "; - print_readSet(rejectReads2Add); + log_os << "rejectReads2Add upated : "; + print_readSet(rejectReads2Add); #endif - } - } - previousWordReads = maxWordReads; + } + } + previousWordReads = maxWordReads; #ifdef DEBUG_ASBL log_os << "Adding rejecting reads " << "\n" @@ -604,13 +604,12 @@ buildContigs( while (! maxWords.empty()) { + maxWord=(*maxWords.begin()); + maxWords.erase(maxWords.begin()); #ifdef DEBUG_ASBL log_os << logtag << "Seeding kmer : " << maxWord << "\n"; #endif - maxWord=(*maxWords.begin()); - maxWords.erase(maxWords.begin()); - AssembledContig newContig; walk(opt, maxWord, wordLength, wordCount, wordSupportReads, seenEdgeBefore, newContig); diff --git a/src/c++/lib/assembly/test/SmallAssemblerTest.cpp b/src/c++/lib/assembly/test/SmallAssemblerTest.cpp index 82d55404..ad9e75af 100644 --- a/src/c++/lib/assembly/test/SmallAssemblerTest.cpp +++ b/src/c++/lib/assembly/test/SmallAssemblerTest.cpp @@ -102,5 +102,57 @@ BOOST_AUTO_TEST_CASE( test_PoisonRead ) } + +BOOST_AUTO_TEST_CASE( test_supportingReadConsistency ) +{ + // test against observed case where a single bad read could kill the whole assembly + + SmallAssemblerOptions assembleOpt; + + assembleOpt.minWordLength = 6; + assembleOpt.maxWordLength = 6; + assembleOpt.minCoverage = 2; + assembleOpt.minSeedReads = 3; + + AssemblyReadInput reads; + reads.push_back( "AAACGTGTATTA"); + reads.push_back( "ACGTGTATTACC"); + reads.push_back( "CGTGTATTACCT"); + reads.push_back( "GTGTATTACCTA"); + reads.push_back( "ATTACCTAGTAC"); + reads.push_back( "TACCTAGTACTC"); + // the above reads build a contig ACGTG TATTACC TAGTAC + // + // Notice ACGTG should not be extended by adding 'A' to the left => AACGTG + // using the reads below, because they have a different suffix after ACGTG *GCC* + // Instead, the reads below build a contig CTTA GCTA ACGTG GCC + reads.push_back("CCCTTAGCTAAC"); + reads.push_back( "CTTAGCTAACGT"); + reads.push_back( "TAGCTAACGTGG"); + reads.push_back( "GCTAACGTGGCC"); + reads.push_back( "AACGTGGCCTAG"); + + + AssemblyReadOutput readInfo; + Assembly contigs; + + runSmallAssembler(assembleOpt, reads, readInfo, contigs); + + BOOST_REQUIRE_EQUAL(contigs.size(),2u); + BOOST_REQUIRE_EQUAL(contigs[0].seq,"AACGTGTATTACCTAGTAC"); + BOOST_REQUIRE_EQUAL(contigs[1].seq,"CTTAGCTAACGTGGCC"); + for (unsigned i(0); i<6; ++i) + { + BOOST_REQUIRE(readInfo[i].isUsed); + BOOST_REQUIRE_EQUAL(readInfo[i].contigId,0u); + } + + for (unsigned i(6); i<11; ++i) + { + BOOST_REQUIRE(readInfo[i].isUsed); + BOOST_REQUIRE_EQUAL(readInfo[i].contigId,1u); + } +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/c++/lib/manta/SVCandidateAssembler.cpp b/src/c++/lib/manta/SVCandidateAssembler.cpp index cc515d62..68b6fdc3 100644 --- a/src/c++/lib/manta/SVCandidateAssembler.cpp +++ b/src/c++/lib/manta/SVCandidateAssembler.cpp @@ -37,7 +37,7 @@ #include //#define DEBUG_REMOTES -#define DEBUG_ASBL +//#define DEBUG_ASBL static diff --git a/src/c++/lib/svgraph/SVLocusNode.hh b/src/c++/lib/svgraph/SVLocusNode.hh index ef1d4de2..17a45119 100644 --- a/src/c++/lib/svgraph/SVLocusNode.hh +++ b/src/c++/lib/svgraph/SVLocusNode.hh @@ -140,7 +140,7 @@ typedef unsigned NodeIndexType; #if 0 class customConstEdgeIterator : public boost::iterator_adaptor< - customConstEdgeIterator // Derived + customConstEdgeIterator // Derived , Finite_vertices_iterator // Base , Vertex_handle // Value , boost::forward_traversal_tag // Traversal type