From a09a1f5278b0cc46a2a1488c0722fdd92e744849 Mon Sep 17 00:00:00 2001 From: ITBE-Lab Date: Fri, 12 May 2023 21:55:30 +0900 Subject: [PATCH] MSV - 2nd revision - coverage based SV call confidence scores - artifact filter in MA - various small things - sv_visualization plot size scaling --- CMakeLists.txt | 2 +- MA.md | 7 ++ MSV.md | 14 ++- libs/ma/src/container/alignment.cpp | 14 ++- libs/ma/src/container/soc.cpp | 9 +- libs/ma/src/module/harmonization.cpp | 29 ++++- libs/ms/inc/ms/util/parameter.h | 7 ++ .../sv_db/query_objects/coverageUpdater.h | 69 ++++++++++++ .../inc/msv/container/sv_db/tables/coverage.h | 106 ++++++++++++++++++ .../msv/container/sv_db/tables/svCallerRun.h | 2 +- libs/msv/inc/msv/module/svJumpsFromSeeds.h | 27 +++++ libs/msv/inc/msv/module/sweepSvJumps.h | 42 +++++-- .../inc/msv/util/statisticSequenceAnalysis.h | 1 - libs/msv/python/computeAccuracyRecall.py | 2 +- libs/msv/python/computeSvJumps.py | 30 ++++- .../sv_visualization/renderer/_setup.py | 6 +- .../renderer/visual_elements/main_plot.py | 5 +- .../renderer/visual_elements/nuc_plot.py | 9 +- .../renderer/visual_elements/read_plot.py | 25 +++-- .../renderer/visual_elements/seed_plot.py | 7 +- .../renderer/visual_elements/size_factor.py | 1 + .../renderer/visual_elements/widgets.py | 1 + libs/msv/python/sweepSvJumps.py | 4 +- .../sv_db/query_objects/coverageUpdater.cpp | 16 +++ libs/msv/src/container/sv_db/svSchema.cpp | 17 +++ libs/msv/src/module/svJumpsFromSeeds.cpp | 1 + libs/msv/src/module/sweepSvJumps.cpp | 2 +- 27 files changed, 403 insertions(+), 52 deletions(-) create mode 100644 libs/msv/inc/msv/container/sv_db/query_objects/coverageUpdater.h create mode 100644 libs/msv/inc/msv/container/sv_db/tables/coverage.h create mode 100644 libs/msv/python/sv_visualization/renderer/visual_elements/size_factor.py create mode 100644 libs/msv/src/container/sv_db/query_objects/coverageUpdater.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fcc7d513..d1f01289 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required (VERSION 3.8) -project("MA" VERSION 2.0.1 DESCRIPTION "MA - The Modular Aligner") +project("MA" VERSION 2.0.2 DESCRIPTION "MA - The Modular Aligner") # optional features option(WITH_PYTHON "Build python integration of libMS." ON) diff --git a/MA.md b/MA.md index ee5c663c..88fd0fd3 100644 --- a/MA.md +++ b/MA.md @@ -120,6 +120,13 @@ The building of the GUI can be disabled using `-DWITH_GUI=OFF`. ## Changelog: +Version 2.0.2: + +* fix for the artifact filter, that removes seeds too far from the median delta distance of all seads in a harmonized SoC. Now the filter only removes seeds if this is advantageous to the SW score of the alignment. + +Version 2.0.1: + +* Revision for MSV - no changes to MA. Version 2.0.0: diff --git a/MSV.md b/MSV.md index 416b57e5..64cbef14 100644 --- a/MSV.md +++ b/MSV.md @@ -63,7 +63,8 @@ MSV represents SV calls (genomic rearrangements) via a skew-symmetric graph mode You can start the visualization tool from MA's main directory via the command: - bokeh serve --show MSV/sv_visualization/bokeh_server.py + cd build/MSV/sv_visualization + bokeh serve --show /bokeh_server.py This opens a browser window. Using the top-right drop-down buttons, select your dataset, run-id and ground-truth. @@ -83,3 +84,14 @@ After adjusting the slider, click the button "Compute Stats", which is the fourt ## Recreating the experiments of the manuscript The experiments of [[Preprint] State-of-the-art structural variant calling: What went conceptually wrong and how to fix it?](https://biorxiv.org/cgi/content/short/2021.01.12.426317v1 "bioRxiv preprint") can be recreated using the [MSV-EVAL repository](https://github.com/ITBE-Lab/MSV-EVAL "MSV-EVAL"). + +## Changelog + +Version 2.0.2: + +* 2nd revision for MSV +* The confidence scores for variant calls now make use of coverage information. Before, the conficance of a call was the number of reads supporting the call. Now it is the number of reads supporting over the the coverage in the region of the call. This helps with making calls in real world data, where coverage is uneven. + +Version 2.0.1: + +* 1st Revision for MSV - only minor changes. \ No newline at end of file diff --git a/libs/ma/src/container/alignment.cpp b/libs/ma/src/container/alignment.cpp index 974ff362..a0d139dc 100644 --- a/libs/ma/src/container/alignment.cpp +++ b/libs/ma/src/container/alignment.cpp @@ -274,6 +274,7 @@ void DLL_PORT( MA ) Alignment::removeDangeling( ) uiLength -= data.back( ).second; data.pop_back( ); } // if + //iScore = reCalcScore(); DEBUG( if( reCalcScore( ) != iScore ) { std::cerr << "WARNING set wrong score or removed wrong elements in remove dangeling" << std::endl; for( auto tup : vCopyOfData ) @@ -297,7 +298,11 @@ void DLL_PORT( MA ) Alignment::removeDangeling( ) int64_t Alignment::reCalcScore( ) const { int64_t iScore = 0; + //int64_t iSVScore = -pGlobalParams->uiSVPenalty->get( ); for( unsigned int index = 0; index < data.size( ); index++ ) + { + //if(iScore < iSVScore) + // iScore = iSVScore; switch( std::get<0>( data[ index ] ) ) { case MatchType::deletion: @@ -316,6 +321,9 @@ int64_t Alignment::reCalcScore( ) const iScore += pGlobalParams->iMatch->get( ) * data[ index ].second; break; } // switch + //if(iScore - pGlobalParams->uiSVPenalty->get( ) > iSVScore) + // iSVScore = iScore - pGlobalParams->uiSVPenalty->get( ); + } return iScore; } // function @@ -380,8 +388,12 @@ void exportAlignment( libMS::SubmoduleOrganizer& xOrganizer ) std::shared_ptr>>>( xOrganizer.container(), "AlignmentVector", "docstr" ); + xOrganizer.container().def("toAlignmentVector", [](std::shared_ptr pPtr){ + return std::dynamic_pointer_cast>>(pPtr); + }); + // tell boost python that pointers of these classes can be converted implicitly - py::implicitly_convertible>, libMS::Container>( ); + py::implicitly_convertible>>( ); } // function #endif \ No newline at end of file diff --git a/libs/ma/src/container/soc.cpp b/libs/ma/src/container/soc.cpp index da3691c2..86accaf5 100644 --- a/libs/ma/src/container/soc.cpp +++ b/libs/ma/src/container/soc.cpp @@ -7,11 +7,15 @@ using namespace libMA; #ifdef WITH_PYTHON +#include void exportSoC( libMS::SubmoduleOrganizer& xOrganizer ) { #if DEBUG_LEVEL >= 1 - py::bind_vector>>( xOrganizer._util(), "nucSeqPairVector", "docstr" ); + //py::bind_vector>>( xOrganizer._util(), "nucSeqPairVector", "docstr" ) + // .def("length", &std::vector>::size) + // .def("at", [](std::vector>& rSelf, size_t uiI){return rSelf[uiI];}) + // ; py::class_( xOrganizer._util(), "nucSeqNucSeqInterval" ) .def_readwrite( "first", &SoCPriorityQueue::blub::first ) @@ -32,7 +36,8 @@ void exportSoC( libMS::SubmoduleOrganizer& xOrganizer ) .def( "pop", &SoCPriorityQueue::pop ) .def( "make_heap", &SoCPriorityQueue::make_heap ) .def( "__len__", &SoCPriorityQueue::size ) - DEBUG(.def_readwrite( "scores", &SoCPriorityQueue::vScores ) + DEBUG( + .def_readwrite( "scores", &SoCPriorityQueue::vScores ) .def_readwrite( "extract", &SoCPriorityQueue::vExtractOrder ) .def_readwrite( "vSoCs", &SoCPriorityQueue::vSoCs ) .def_readwrite( "vHarmSoCs", &SoCPriorityQueue::vHarmSoCs ) diff --git a/libs/ma/src/module/harmonization.cpp b/libs/ma/src/module/harmonization.cpp index e7e110ab..b3e7e24b 100644 --- a/libs/ma/src/module/harmonization.cpp +++ b/libs/ma/src/module/harmonization.cpp @@ -153,7 +153,14 @@ std::shared_ptr Harmonization::applyFilters( std::shared_ptr& pIn double dDeltaDistDiff = std::abs( iDeltaDistToPre - iDeltaDistToPost ) * 2 / ( (double)iDeltaDistToPre + iDeltaDistToPost ); - if( dDeltaDistDiff < dMaxDeltaDist && (uint64_t)iDeltaDistToPre > uiMinDeltaDist ) + int64_t iScoreSeed = rCenter.size() * pGlobalParams->iMatch->get( ); + int64_t iIndelPenalty = std::min((int64_t)pGlobalParams->uiSVPenalty->get( ) * 2, + (int64_t)(pGlobalParams->iGap->get( ) * 2 + + pGlobalParams->iExtend->get( ) * (iDeltaDistToPre + iDeltaDistToPost)) + ); + + if( dDeltaDistDiff < dMaxDeltaDist && (uint64_t)iDeltaDistToPre > uiMinDeltaDist && + iScoreSeed < iIndelPenalty * 2 ) { // the filter triggered -> // we flag the seed to be ignored by setting it's size to zero @@ -255,6 +262,8 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee pSeeds->xStats = pSeedsIn->xStats; if( pSeedsIn->size( ) > 1 ) { + + DEBUG( pSoCIn->vSoCs.push_back( pSeedsIn ); ) // DEBUG #if USE_RANSAC == 1 // switch between ransac line angle + intercept estimation & 45deg median line std::vector vX, vY; vX.reserve( pSeedsIn->size( ) ); @@ -281,7 +290,8 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee pSeedsIn->erase( std::remove_if( pSeedsIn->begin( ), pSeedsIn->end( ), [ & ]( const Seed& rS ) { return deltaDistance( rS, xSlopeIntercept.first, - (int64_t)xSlopeIntercept.second ) > fMAD; + (int64_t)xSlopeIntercept.second ) > fMAD && + rS.size() < 50; } ), pSeedsIn->end( ) ); @@ -301,8 +311,8 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee for( Seeds::iterator pSeed = pSeedsIn->begin( ); pSeed != pSeedsIn->end( ); pSeed++ ) { pShadows->push_back( std::make_tuple( pSeed, pSeed->start( ), pSeed->end_ref( ) ) ); - // std::cout << "(" << pSeed->start() << "," << pSeed->start_ref() << "," << - // pSeed->size() << ")," << std::endl; + //std::cout << "1 (" << pSeed->start() << "," << pSeed->start_ref() << "," << + //pSeed->size() << ")" << std::endl; } // for // perform the line sweep algorithm on the left shadows @@ -312,8 +322,12 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee // get the right shadows for( auto& xT : *pShadows2 ) + { pShadows->push_back( std::make_tuple( std::get<0>( xT ), std::get<0>( xT )->start_ref( ), std::get<0>( xT )->end( ) ) ); + //std::cout << "2 (" << std::get<0>( xT )->start_ref() << "," << std::get<0>( xT )->end() << "," << + //std::get<0>( xT )->size() << ")" << std::endl; + } // perform the line sweep algorithm on the right shadows pShadows = linesweep( pShadows, (int64_t)xSlopeIntercept.second, xSlopeIntercept.first ); @@ -321,7 +335,11 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee pSeeds->reserve( pShadows->size( ) ); for( auto& xT : *pShadows ) + { pSeeds->push_back( *std::get<0>( xT ) ); + // std::cout << "3 (" << std::get<0>( xT )->start() << "," << std::get<0>( xT )->start_ref() << "," << + // std::get<0>( xT )->size() << ")" << std::endl; + } pSeeds->bConsistent = true; @@ -367,7 +385,8 @@ std::shared_ptr Harmonization::harmonizeOne( std::shared_ptr& pSee pSoCIn->vIngroup.push_back( std::make_shared( ) ); pSoCIn->vSlopes.push_back( 0 ); pSoCIn->vIntercepts.push_back( 0 ); - pSoCIn->vHarmSoCs.push_back( std::make_shared( pSeeds ) ); ) // DEBUG + pSoCIn->vHarmSoCs.push_back( std::make_shared( pSeeds ) ); + pSoCIn->vSoCs.push_back( std::make_shared( pSeeds ) ); ) // DEBUG } // else return pSeeds; } diff --git a/libs/ms/inc/ms/util/parameter.h b/libs/ms/inc/ms/util/parameter.h index 8a2fceeb..c3d491e3 100644 --- a/libs/ms/inc/ms/util/parameter.h +++ b/libs/ms/inc/ms/util/parameter.h @@ -579,6 +579,7 @@ class Presetting : public ParameterSetBase AlignerParameterPointer xMaxFuzzinessFilter; // maximal fuzziness for sv calls AlignerParameterPointer xMaxSuppNtShortCallFilter; // maximal number of nt for short call low support filter AlignerParameterPointer xMaxCallSizeShortCallFilter; // maximal call size for short call low support filter + AlignerParameterPointer xMaxRegionSizeForAmbiguityFilter; // max size of the AlignerParameterPointer xMaxRefAmbiguityJump; // Max Ref Ambiguity Jump AlignerParameterPointer xMMFilterMaxOcc; // xMMFilterMaxOcc AlignerParameterPointer xMinNtInSoc; // Min NT in SoC @@ -802,6 +803,8 @@ class Presetting : public ParameterSetBase xMaxSuppNtShortCallFilter( this, "Max Supp Nt", "- unused -", SV_PARAMETERS, 10, checkPositiveValue ), xMaxCallSizeShortCallFilter( this, "Max Call Size Filter", "- unused -", SV_PARAMETERS, 20, checkPositiveValue ), + xMaxRegionSizeForAmbiguityFilter( this, "Max Regionsize Ambiguity Filter", "", SV_PARAMETERS, 200, + checkPositiveValue ), xMaxRefAmbiguityJump( this, "Max Ref Ambiguity Jump", "- unused -", SV_PARAMETERS, 10, checkPositiveValue ), xMMFilterMaxOcc( this, "Max Occ MM Filter", @@ -1028,6 +1031,7 @@ class GlobalParameter : public ParameterSetBase AlignerParameterPointer iExtend; AlignerParameterPointer iGap2; AlignerParameterPointer iExtend2; + AlignerParameterPointer xCoverageBinSize; // Coveragebin size AlignerParameterPointer uiSVPenalty; /* Constructor */ @@ -1052,6 +1056,9 @@ class GlobalParameter : public ParameterSetBase DP_PARAMETERS, 24, checkPositiveValue ), iExtend2( this, "Second Extend Penalty", "Second penalty for gap extension. (Two piece affine gap costs)", DP_PARAMETERS, 1, checkPositiveValue ), + xCoverageBinSize( this, "Coveragebin size", + "Binsize for storing the seed coverage on the genome", + SV_PARAMETERS, 100, checkPositiveValue ), uiSVPenalty( this, "Pick Local Seed Set C - Maximal Gap Penalty", "Maximal gap cost penalty during local seed set computation.", HEURISTIC_PARAMETERS, 100, checkPositiveValue ) diff --git a/libs/msv/inc/msv/container/sv_db/query_objects/coverageUpdater.h b/libs/msv/inc/msv/container/sv_db/query_objects/coverageUpdater.h new file mode 100644 index 00000000..85330133 --- /dev/null +++ b/libs/msv/inc/msv/container/sv_db/query_objects/coverageUpdater.h @@ -0,0 +1,69 @@ +/** + * @file coverageUpdater.h + * @brief + * @author Markus Schmidt + */ +#pragma once + +#include "ma/container/seed.h" +#include "ms/module/get_inserter_container_module.h" +#include "msv/container/sv_db/tables/coverage.h" +#include "util/geom.h" + +using namespace libMA; +using namespace libMS; + +namespace libMSV +{ +/** + * @brief Insertion of coverage into the database. + */ +template +class CoverageUpdaterContainer + : public InserterContainer +{ + public: + using ParentType = + InserterContainer; + const size_t uiBinSize; + + + CoverageUpdaterContainer( std::shared_ptr> pPool, int64_t iId, + std::shared_ptr pSharedProfiler ) + : ParentType::InserterContainer( pPool, iId, pSharedProfiler ), + uiBinSize(pGlobalParams->xCoverageBinSize->get()) + {} // constructur + + protected: + virtual size_t insert_override( std::shared_ptr pSeeds ) + { + std::set xCoveredBinIds; + for( Seed& rSeed : *pSeeds ) + { + const nucSeqIndex uiStartBinId = rSeed.start_ref_cons_rev() / uiBinSize; + const nucSeqIndex uiEndBinId = 1 + (rSeed.end_ref_cons_rev() - 1) / uiBinSize; + + for(nucSeqIndex uiX = uiStartBinId; uiX < uiEndBinId; uiX++) + xCoveredBinIds.insert(uiX); + } // for + + for(int64_t uiBinId : xCoveredBinIds) + ParentType::pInserter->incCoverage( ParentType::iId /* <- id of the sv jump run */, uiBinId, (uint32_t)1 ); + return xCoveredBinIds.size( ); + } // method +}; // class + +template +using GetCoverageUpdaterContainerModule = + GetInserterContainerModule; + + +template using CoverageUpdaterModule = InserterModule>; + + +} // namespace libMSV + +#ifdef WITH_PYTHON +/// @brief used to expose libMSV::CoverageUpdaterContainer to python +void exportCoverageUpdater( libMS::SubmoduleOrganizer& xOrganizer ); +#endif diff --git a/libs/msv/inc/msv/container/sv_db/tables/coverage.h b/libs/msv/inc/msv/container/sv_db/tables/coverage.h new file mode 100644 index 00000000..4f43d391 --- /dev/null +++ b/libs/msv/inc/msv/container/sv_db/tables/coverage.h @@ -0,0 +1,106 @@ +/** + * @file sequencer.h + * @details + * Database interface for the structural variant caller. + * One table of the database. + */ +#pragma once + +#include "sql_api.h" // NEW DATABASE INTERFACE + +namespace libMSV +{ + + +class CoverageCollector : public libMS::Container +{ +public: + const size_t uiBinSize; + std::vector> vCoverage; + + CoverageCollector(libMA::nucSeqIndex uiGenomeSize) : + uiBinSize(pGlobalParams->xCoverageBinSize->get()), + vCoverage(uiGenomeSize / uiBinSize) + {} + + inline void addRead(std::set xCoverage) + { + for(size_t uiPos : xCoverage) + vCoverage[uiPos]++; // atomic so no parralelization needed + } + +}; + + +template +using CoverageTableType = SQLTableWithAutoPriKey; +const json jCoverageTableDef = { { TABLE_NAME, "coverage_table" }, + { TABLE_COLUMNS, { { { COLUMN_NAME, "run_id" } }, + { { COLUMN_NAME, "bin_id" } }, + { { COLUMN_NAME, "bin_coverage" } } } } }; + +/** + * @brief contains the name of a sequencer run + * @details + * In order to create sv jumps with different parameters for different read types, + * each read type has to have a entry in the sequencer_table. + * Then jumps can be computed separated for each sequencer_table entry. + * Although the jumps are created separated, they can all be used together in the line sweep. + */ +template class CoverageTable : public CoverageTableType +{ + SQLQuery xQueryCoverage; + SQLStatement xUpdateCoverage; + + + public: + CoverageTable( std::shared_ptr pDB ) : + CoverageTableType( pDB, jCoverageTableDef ), + xQueryCoverage( pDB, "SELECT bin_coverage FROM coverage_table WHERE run_id = ? AND bin_id = ? LIMIT 1" ), + xUpdateCoverage( pDB, + "UPDATE coverage_table " + "SET bin_coverage = bin_coverage + ? " + "WHERE run_id = ? " + "AND bin_id = ? " ) + {} // default constructor + + inline void createIndices( ) + { + this->addIndex( + json{ { INDEX_NAME, "bin_id_index" }, + { INDEX_COLUMNS, "run_id, bin_id" } } ); + } // method + + inline void dropIndices( ) + { + this->dropIndex( json{ { INDEX_NAME, "bin_id_index" } } ); + } + + inline uint32_t getCoverage( int64_t jump_run_id, int64_t uiBinId ) + { + return xQueryCoverage.scalar( jump_run_id, uiBinId ); + } // method + + inline void incCoverage( int64_t jump_run_id, int64_t uiBinId, uint32_t uiAmount ) + { + xUpdateCoverage.exec( uiAmount, jump_run_id, uiBinId ); + } // method + + inline void initCoverage( int64_t jump_run_id, int64_t uiNumBins ) + { + for(size_t uiI = 0; uiI < uiNumBins; uiI++) + CoverageTableType::insert( jump_run_id, uiI, 0 ); + } + + inline void initCoverageFromColl( int64_t jump_run_id, CoverageCollector& rColl ) + { + for(size_t uiI = 0; uiI < rColl.vCoverage.size(); uiI++) + CoverageTableType::insert( jump_run_id, uiI, rColl.vCoverage[uiI] ); + } +}; // class + +} // namespace libMSV diff --git a/libs/msv/inc/msv/container/sv_db/tables/svCallerRun.h b/libs/msv/inc/msv/container/sv_db/tables/svCallerRun.h index 1bbb99c4..f8486d8c 100644 --- a/libs/msv/inc/msv/container/sv_db/tables/svCallerRun.h +++ b/libs/msv/inc/msv/container/sv_db/tables/svCallerRun.h @@ -49,7 +49,7 @@ template class SvCallerRunTable : public SvCallerRunTableType +{ + const size_t uiBinSize; + public: + CollectSeedCoverage( const ParameterSetManager& ) + : uiBinSize(pGlobalParams->xCoverageBinSize->get()) + {} // constructor + + std::shared_ptr + execute( std::shared_ptr pSeeds, std::shared_ptr pCollector ) + { + std::set xCoveredBinIds; + for( Seed& rSeed : *pSeeds ) + { + const nucSeqIndex uiStartBinId = rSeed.start_ref_cons_rev() / uiBinSize; + const nucSeqIndex uiEndBinId = 1 + (rSeed.end_ref_cons_rev() - 1) / uiBinSize; + + for(nucSeqIndex uiX = uiStartBinId; uiX < uiEndBinId; uiX++) + xCoveredBinIds.insert(uiX); + } // for + pCollector->addRead(xCoveredBinIds); + return std::make_shared(); + } // method +}; // class + }; // namespace libMSV #ifdef WITH_PYTHON diff --git a/libs/msv/inc/msv/module/sweepSvJumps.h b/libs/msv/inc/msv/module/sweepSvJumps.h index de914263..6c23c32b 100644 --- a/libs/msv/inc/msv/module/sweepSvJumps.h +++ b/libs/msv/inc/msv/module/sweepSvJumps.h @@ -10,6 +10,7 @@ #include "msv/container/squeezedVector.h" #include "msv/container/sv_db/query_objects/callInserter.h" // NEW DB API implemented #include "msv/container/sv_db/query_objects/fetchSvJump.h" +#include "msv/container/sv_db/tables/coverage.h" #include "msv/module/abstractFilter.h" #include "msv/util/statisticSequenceAnalysis.h" #include @@ -634,27 +635,44 @@ class FilterDiagonalLineCalls * - if we don't switch strand we have to match the two 'left's and two 'right's * - we always have to pick one 'from' and one 'to' together. */ +template class ComputeCallAmbiguity - : public Module + : public Module> { nucSeqIndex uiDistance; + size_t uiSvJumpRunId; + size_t uiBinSize; public: - ComputeCallAmbiguity( const ParameterSetManager& rParameters ) - : uiDistance( rParameters.getSelected( )->xMaxCallSizeShortCallFilter->get( ) ) + ComputeCallAmbiguity( const ParameterSetManager& rParameters, size_t uiSvJumpRunId ) + : uiDistance( rParameters.getSelected( )->xMaxRegionSizeForAmbiguityFilter->get( ) ), + uiSvJumpRunId(uiSvJumpRunId), + uiBinSize(pGlobalParams->xCoverageBinSize->get()) {} // constructor std::shared_ptr - execute( std::shared_ptr pCalls, std::shared_ptr pPack ) + execute( std::shared_ptr pCalls, std::shared_ptr pPack, + std::shared_ptr> pPool ) { - for( auto pCall : pCalls->vContent ) - { - auto f = pCall->xXAxis.start( ) + pCall->xXAxis.size( ) / 2; - auto t = pCall->xYAxis.start( ) + pCall->xYAxis.size( ) / 2; - - pCall->uiReferenceAmbiguity = - sampleSequenceAmbiguity( f, t, pCall->bFromForward, pCall->bToForward, pPack, uiDistance, 5 ); - } // for + pPool->xPool.run( + [ this ]( auto pConnection, std::shared_ptr pCalls, + std::shared_ptr pPack ) + { + CoverageTable xCovTable(pConnection); + for( auto pCall : pCalls->vContent ) + { + auto f = pCall->xXAxis.start( ) + pCall->xXAxis.size( ) / 2; + auto t = pCall->xYAxis.start( ) + pCall->xYAxis.size( ) / 2; + + pCall->uiReferenceAmbiguity = + sampleSequenceAmbiguity( f, t, pCall->bFromForward, pCall->bToForward, pPack, uiDistance, 5 ) + + xCovTable.getCoverage(uiSvJumpRunId, f / uiBinSize) + + xCovTable.getCoverage(uiSvJumpRunId, t / uiBinSize) + ; + } // for + }, + pCalls, pPack + ); return pCalls; } // method }; // class diff --git a/libs/msv/inc/msv/util/statisticSequenceAnalysis.h b/libs/msv/inc/msv/util/statisticSequenceAnalysis.h index b3daf4da..c6fbfa10 100644 --- a/libs/msv/inc/msv/util/statisticSequenceAnalysis.h +++ b/libs/msv/inc/msv/util/statisticSequenceAnalysis.h @@ -51,7 +51,6 @@ inline nucSeqIndex sampleSequenceAmbiguity( NucSeq& rSequence, double t ) inline nucSeqIndex DLL_PORT( MSV ) sampleAmbiguity( std::shared_ptr pSeqA, std::shared_ptr pSeqB ) { - // + 1 to avoind division by zero error return std::max( 1, (int)sampleSequenceAmbiguity( *pSeqA, *pSeqB, 0.001 ) - (int)pSeqA->length( ) - (int)pSeqB->length( ) ); } // method diff --git a/libs/msv/python/computeAccuracyRecall.py b/libs/msv/python/computeAccuracyRecall.py index a8655d67..618ecb98 100644 --- a/libs/msv/python/computeAccuracyRecall.py +++ b/libs/msv/python/computeAccuracyRecall.py @@ -9,7 +9,7 @@ def compute_accuracy_recall(dataset_name, blurs, gt_ids, run_ids, out_prefix): if not gt_id is None: for run_id in run_ids[gt_id]: if not run_id is None: - print("computing", blur, gt_id, run_id) + print("computing", dataset_name, blur, gt_id, run_id, out_prefix) stats, gt_total = count_calls_from_db.count(run_id, gt_id, blur) with open(out_prefix + dataset_name + "-" + str(run_id) + "-" + str(blur) + ".tsv", "w") as out_file: out_file.write("//|ground truth| = " + str(gt_total) + "\n") diff --git a/libs/msv/python/computeSvJumps.py b/libs/msv/python/computeSvJumps.py index b0910a49..4cfef8db 100644 --- a/libs/msv/python/computeSvJumps.py +++ b/libs/msv/python/computeSvJumps.py @@ -26,6 +26,16 @@ def scope(): "python built comp graph") jump_inserter_module = JumpInserterModule(parameter_set_manager) + #cov_table.init_coverage(get_jump_inserter.cpp_module.id, + # pack.unpacked_size_single_strand // parameter_set_manager.by_name("Coveragebin size").get()) + #get_coverage_updater = GetCoverageUpdater(parameter_set_manager, get_jump_inserter.cpp_module.id) + #coverage_updater_module = CoverageUpdaterModule(parameter_set_manager) + cc_module = CollectSeedCoverage(parameter_set_manager) + + join_module = ContainerJoin(parameter_set_manager) + + cc_col = Pledge() + cc_col.set(CoverageCollector(pack.unpacked_size_single_strand)) mm_pledge = Pledge() mm_pledge.set(mm_index) pack_pledge = Pledge() @@ -81,14 +91,26 @@ def scope(): write_to_db_pledge = promise_me(jump_inserter_module, jump_inserter, pool_pledge, filtered_jumps, query_pledge) analyze.register("JumpInserterModule", write_to_db_pledge, True) - unlock_pledge = promise_me(UnLock(parameter_set_manager, query_pledge), write_to_db_pledge) + #cov_updater = promise_me(get_coverage_updater, pool_pledge) + #inserter_vec.append(cov_updater) + #analyze.register("GetCovUpdater", cov_updater, True) + #write_to_db_pledge_2 = promise_me(coverage_updater_module, cov_updater, pool_pledge, + # filtered_seeds_pledge_3) + #analyze.register("CoverageUpdaterModule", write_to_db_pledge_2, True) + write_to_db_pledge_2 = promise_me(cc_module, filtered_seeds_pledge_3, cc_col) + join_pledge = promise_me(join_module, write_to_db_pledge, write_to_db_pledge_2) + analyze.register("Join", join_pledge, True) + unlock_pledge = promise_me(UnLock(parameter_set_manager, query_pledge), join_pledge) analyze.register("UnLock", unlock_pledge, True) res.append(unlock_pledge) # drain all sources res.simultaneous_get(parameter_set_manager.get_num_threads()) for inserter in inserter_vec: - inserter.get().close(pool_pledge.get()) # @todo for some reason the destructor does not trigger automatically :( + # @todo for some reason the destructor does not trigger automatically :( + inserter.get().close(pool_pledge.get()) + + cov_table.init_coverage_from_col(get_jump_inserter.cpp_module.id, cc_col.get()) analyze.analyze(runtime_file) @@ -100,6 +122,9 @@ def scope(): jump_table = SvJumpTable(db_conn) jump_table.drop_indices(0) # number does nothing at the moment + cov_table = CoverageTable(db_conn) + cov_table.drop_indices() + jump_id = scope() analyze = AnalyzeRuntimes() @@ -109,6 +134,7 @@ def scope(): print("creating index...") start = datetime.datetime.now() jump_table.create_indices( jump_id ) + cov_table.create_indices() end = datetime.datetime.now() delta = end - start analyze.register("create_indices", delta.total_seconds(), False, lambda x: x) diff --git a/libs/msv/python/sv_visualization/renderer/_setup.py b/libs/msv/python/sv_visualization/renderer/_setup.py index 89c3ce24..2c30e149 100644 --- a/libs/msv/python/sv_visualization/renderer/_setup.py +++ b/libs/msv/python/sv_visualization/renderer/_setup.py @@ -94,4 +94,8 @@ def setup(self): self.widgets.run_id_dropdown.menu = menu self.widgets.ground_truth_id_dropdown.menu = menu - self.render() \ No newline at end of file + self.render() + elif dataset_name is None: + print("missing .json file for", dataset_name) + else: + print("missing .json file for", dataset_name, "at:", JSON_PREFIX + dataset_name + "/info.json") \ No newline at end of file diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/main_plot.py b/libs/msv/python/sv_visualization/renderer/visual_elements/main_plot.py index 404f81e9..9422acfd 100644 --- a/libs/msv/python/sv_visualization/renderer/visual_elements/main_plot.py +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/main_plot.py @@ -3,12 +3,13 @@ from bokeh.plotting import ColumnDataSource import copy from bokeh.events import Tap +from .size_factor import PLOT_SIZE_FAC class MainPlot: def __init__(self, renderer): self.plot = figure( - width=900, - height=900, + width=int(90 * PLOT_SIZE_FAC), + height=int(90 * PLOT_SIZE_FAC), tools=[ "pan", "box_zoom", "wheel_zoom", "save", diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/nuc_plot.py b/libs/msv/python/sv_visualization/renderer/visual_elements/nuc_plot.py index 95d19b81..e1f76da5 100644 --- a/libs/msv/python/sv_visualization/renderer/visual_elements/nuc_plot.py +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/nuc_plot.py @@ -1,12 +1,13 @@ from bokeh.plotting import figure from bokeh.models.tools import HoverTool from bokeh.plotting import ColumnDataSource +from .size_factor import PLOT_SIZE_FAC class NucPlot: def __init__(self, main_plot): self.left_plot = figure( - width=40, - height=900, + width=int(4 * PLOT_SIZE_FAC), + height=int(90 * PLOT_SIZE_FAC), y_range=main_plot.plot.y_range, tools=["ypan", "ywheel_zoom"], active_scroll="ywheel_zoom", @@ -16,8 +17,8 @@ def __init__(self, main_plot): self.left_plot.grid.visible = False self.bottom_plot = figure( - width=900, - height=40, + width=int(90 * PLOT_SIZE_FAC), + height=int(4 * PLOT_SIZE_FAC), x_range=main_plot.plot.x_range, tools=["xpan", "xwheel_zoom"], active_scroll="xwheel_zoom", diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/read_plot.py b/libs/msv/python/sv_visualization/renderer/visual_elements/read_plot.py index a1c211ad..5ff57204 100644 --- a/libs/msv/python/sv_visualization/renderer/visual_elements/read_plot.py +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/read_plot.py @@ -5,12 +5,13 @@ from bokeh.palettes import Plasma256 from bokeh.events import Tap import math +from .size_factor import PLOT_SIZE_FAC class ReadPlotNucs: def __init__(self, nuc_plot, read_plot): self.left_plot = figure( - width=100, - height=400, + width=int(20 * PLOT_SIZE_FAC), + height=int(40 * PLOT_SIZE_FAC), y_range=read_plot.plot.y_range, tools=["ypan", "ywheel_zoom"], active_scroll="ywheel_zoom", @@ -21,8 +22,8 @@ def __init__(self, nuc_plot, read_plot): self.left_plot.yaxis.axis_label = "Read Position" self.bottom_plot = figure( - width=900, - height=200, + width=int(90 * PLOT_SIZE_FAC), + height=int(20 * PLOT_SIZE_FAC), x_range=read_plot.plot.x_range, tools=["xpan", "xwheel_zoom"], active_scroll="xwheel_zoom", @@ -64,8 +65,8 @@ class ReadPlot: def __init__(self, nuc_plot, renderer): self.plot = figure( - width=900, - height=400, + width=int(90 * PLOT_SIZE_FAC), + height=int(40 * PLOT_SIZE_FAC), tools=[ "pan", "box_zoom", "wheel_zoom", "reset" @@ -121,8 +122,8 @@ def __init__(self, nuc_plot, renderer): self.recalc_stat = True stat_plot1 = figure( - width=900, - height=250, + width=int(90 * PLOT_SIZE_FAC), + height=int(50 * PLOT_SIZE_FAC), tools=[ "pan", "box_zoom", "wheel_zoom", "reset" @@ -137,8 +138,8 @@ def __init__(self, nuc_plot, renderer): tab1 = Panel(child=stat_plot1, title="Min Score") stat_plot4 = figure( - width=900, - height=250, + width=int(90 * PLOT_SIZE_FAC), + height=stat_plot1.height, tools=[ "pan", "box_zoom", "wheel_zoom", "reset" @@ -152,7 +153,7 @@ def __init__(self, nuc_plot, renderer): tab4 = Panel(child=stat_plot4, title="Recall & Accuracy") stat_plot2 = figure( - width=900, + width=int(90 * PLOT_SIZE_FAC), height=stat_plot1.height, tools=[ "pan", "box_zoom", @@ -168,7 +169,7 @@ def __init__(self, nuc_plot, renderer): tab2 = Panel(child=stat_plot2, title="Blur") stat_plot3 = figure( - width=900, + width=int(90 * PLOT_SIZE_FAC), height=stat_plot1.height, tools=[ "pan", "box_zoom", diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/seed_plot.py b/libs/msv/python/sv_visualization/renderer/visual_elements/seed_plot.py index 46b1498d..640e0e4a 100644 --- a/libs/msv/python/sv_visualization/renderer/visual_elements/seed_plot.py +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/seed_plot.py @@ -6,13 +6,14 @@ from bokeh.events import Tap import math import copy +from .size_factor import PLOT_SIZE_FAC class SeedPlot: def __init__(self, main_plot, renderer): - self.plot_width = 300 + self.plot_width = int(40 * PLOT_SIZE_FAC) self.left_plot = figure( width=self.plot_width, - height=900, + height=int(90 * PLOT_SIZE_FAC), y_range=main_plot.plot.y_range, x_range=(0,0), tools=["xpan", "xwheel_zoom"], @@ -25,7 +26,7 @@ def __init__(self, main_plot, renderer): self.left_plot.rect(x=0.5, y=0, width=1, height=1, color="white", alpha=0) self.bottom_plot = figure( - width=900, + width=int(90 * PLOT_SIZE_FAC), height=self.plot_width, x_range=main_plot.plot.x_range, y_range=self.left_plot.x_range, diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/size_factor.py b/libs/msv/python/sv_visualization/renderer/visual_elements/size_factor.py new file mode 100644 index 00000000..3c425bea --- /dev/null +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/size_factor.py @@ -0,0 +1 @@ +PLOT_SIZE_FAC = 5 \ No newline at end of file diff --git a/libs/msv/python/sv_visualization/renderer/visual_elements/widgets.py b/libs/msv/python/sv_visualization/renderer/visual_elements/widgets.py index ddf49231..b6952389 100644 --- a/libs/msv/python/sv_visualization/renderer/visual_elements/widgets.py +++ b/libs/msv/python/sv_visualization/renderer/visual_elements/widgets.py @@ -7,6 +7,7 @@ from MSV import * import copy import threading +from .size_factor import PLOT_SIZE_FAC class Widgets: def __init__(self, renderer): diff --git a/libs/msv/python/sweepSvJumps.py b/libs/msv/python/sweepSvJumps.py index 209ea079..dbe6cc18 100644 --- a/libs/msv/python/sweepSvJumps.py +++ b/libs/msv/python/sweepSvJumps.py @@ -34,7 +34,7 @@ def graph(pool): filter2 = FilterFuzzyCalls(parameter_set_manager) filter5 = FilterDiagonalLineCalls(parameter_set_manager) filter6 = FilterLowScoreCalls(parameter_set_manager) - call_ambiguity = ComputeCallAmbiguity(parameter_set_manager) + call_ambiguity = ComputeCallAmbiguity(parameter_set_manager, run_id) get_call_inserter = GetCallVectorInserter(parameter_set_manager, DbConn(dataset_name), name, desc, run_id) call_inserter_module = CallVectorInserterModule(parameter_set_manager) @@ -76,7 +76,7 @@ def graph(pool): #filter3_pledge = promise_me(filter5, filter2_pledge) #analyze.register("FilterDiagonalLineCalls", filter3_pledge, True) - call_ambiguity_pledge = promise_me(call_ambiguity, sweep2_pledge, pack_pledge) + call_ambiguity_pledge = promise_me(call_ambiguity, sweep2_pledge, pack_pledge, pool_pledge) analyze.register("ComputeCallAmbiguity", call_ambiguity_pledge, True) #filter6_pledge = promise_me(filter6, call_ambiguity_pledge) diff --git a/libs/msv/src/container/sv_db/query_objects/coverageUpdater.cpp b/libs/msv/src/container/sv_db/query_objects/coverageUpdater.cpp new file mode 100644 index 00000000..fb6b0dbe --- /dev/null +++ b/libs/msv/src/container/sv_db/query_objects/coverageUpdater.cpp @@ -0,0 +1,16 @@ +#include "msv/container/sv_db/query_objects/coverageUpdater.h" + +using namespace libMSV; + +#ifdef WITH_PYTHON + +#include "ms/container/sv_db/py_db_conf.h" + +void exportCoverageUpdater( libMS::SubmoduleOrganizer& xOrganizer ) +{ + exportInserterContainer>( xOrganizer, "CoverageUpdater" ); + + exportModule>( xOrganizer, "CoverageUpdaterModule" ); +} // function + +#endif diff --git a/libs/msv/src/container/sv_db/svSchema.cpp b/libs/msv/src/container/sv_db/svSchema.cpp index 88dd5068..c54424e6 100644 --- a/libs/msv/src/container/sv_db/svSchema.cpp +++ b/libs/msv/src/container/sv_db/svSchema.cpp @@ -9,6 +9,7 @@ #include "msv/container/sv_db/query_objects/fetchCalls.h" #include "msv/container/sv_db/query_objects/fetchSvJump.h" #include "msv/container/sv_db/query_objects/jumpInserter.h" +#include "msv/container/sv_db/query_objects/coverageUpdater.h" #include "msv/container/sv_db/query_objects/kMerInserter.h" #include "msv/container/sv_db/query_objects/nucSeqSql.h" #include "msv/container/sv_db/query_objects/readInserter.h" @@ -135,6 +136,21 @@ void exportSoCDbWriter( libMS::SubmoduleOrganizer& xOrganizer ) py::class_, std::shared_ptr>>( xOrganizer.util( ), "JumpRunTable" ) .def( py::init>( ) ); + + + py::class_>( xOrganizer.util( ), "CoverageCollector" ) + .def( py::init( ) ); + + py::class_, std::shared_ptr>>( xOrganizer.util( ), + "CoverageTable" ) + .def( py::init>( ) ) + .def( "get_coverage", &CoverageTable::getCoverage ) + .def( "drop_indices", &CoverageTable::dropIndices ) + .def( "create_indices", &CoverageTable::createIndices ) + .def( "init_coverage", &CoverageTable::initCoverage ) + .def( "init_coverage_from_col", &CoverageTable::initCoverageFromColl ) + ; + py::class_, std::shared_ptr>>( xOrganizer.util( ), "SequencerTable" ) .def( py::init>( ) ); @@ -218,6 +234,7 @@ void exportSoCDbWriter( libMS::SubmoduleOrganizer& xOrganizer ) exportCallsFromDb( xOrganizer ); exportSvJump( xOrganizer ); exportSvJumpInserter( xOrganizer ); + exportCoverageUpdater( xOrganizer ); exportNucSeqSql( xOrganizer ); exportReadInserter( xOrganizer ); exportKMerInserter( xOrganizer ); diff --git a/libs/msv/src/module/svJumpsFromSeeds.cpp b/libs/msv/src/module/svJumpsFromSeeds.cpp index 1f3ca857..519fa279 100644 --- a/libs/msv/src/module/svJumpsFromSeeds.cpp +++ b/libs/msv/src/module/svJumpsFromSeeds.cpp @@ -401,6 +401,7 @@ void exportSvJumpsFromSeeds( libMS::SubmoduleOrganizer& xOrganizer ) exportModule( xOrganizer, "FilterJumpsByRegion" ); exportModule( xOrganizer, "FilterJumpsByRegionSquare" ); exportModule( xOrganizer, "FilterJumpsByRefAmbiguity" ); + exportModule( xOrganizer, "CollectSeedCoverage" ); xOrganizer.util( ).def( "compute_seeds_area", &computeSeeds ); } // function diff --git a/libs/msv/src/module/sweepSvJumps.cpp b/libs/msv/src/module/sweepSvJumps.cpp index cc806a35..0a68faae 100644 --- a/libs/msv/src/module/sweepSvJumps.cpp +++ b/libs/msv/src/module/sweepSvJumps.cpp @@ -45,7 +45,7 @@ void exportSweepSvJump( libMS::SubmoduleOrganizer& xOrganizer ) exportModule( xOrganizer, "FilterLowScoreCalls" ); exportModule( xOrganizer, "FilterDiagonalLineCalls" ); exportModule( xOrganizer, "FilterFuzzyCalls" ); - exportModule( xOrganizer, "ComputeCallAmbiguity" ); + exportModule, size_t>( xOrganizer, "ComputeCallAmbiguity" ); py::class_( xOrganizer.util( ), "AbstractFilter" ) .def_readwrite_static( "silent", &AbstractFilter::bSilent );