Skip to content

Commit

Permalink
MSV - 2nd revision
Browse files Browse the repository at this point in the history
- coverage based SV call confidence scores
- artifact filter in MA
- various small things
- sv_visualization plot size scaling
  • Loading branch information
MarkusRainerSchmidt committed May 12, 2023
1 parent c7ab8e2 commit a09a1f5
Show file tree
Hide file tree
Showing 27 changed files with 403 additions and 52 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

cmake_minimum_required (VERSION 3.8)

project("MA" VERSION 2.0.1 DESCRIPTION "MA - The Modular Aligner")
project("MA" VERSION 2.0.2 DESCRIPTION "MA - The Modular Aligner")

# optional features
option(WITH_PYTHON "Build python integration of libMS." ON)
Expand Down
7 changes: 7 additions & 0 deletions MA.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ The building of the GUI can be disabled using `-DWITH_GUI=OFF`.

## Changelog:

Version 2.0.2:

* fix for the artifact filter, that removes seeds too far from the median delta distance of all seads in a harmonized SoC. Now the filter only removes seeds if this is advantageous to the SW score of the alignment.

Version 2.0.1:

* Revision for MSV - no changes to MA.

Version 2.0.0:

Expand Down
14 changes: 13 additions & 1 deletion MSV.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ MSV represents SV calls (genomic rearrangements) via a skew-symmetric graph mode

You can start the visualization tool from MA's main directory via the command:

bokeh serve --show MSV/sv_visualization/bokeh_server.py
cd build/MSV/sv_visualization
bokeh serve --show /bokeh_server.py
This opens a browser window.

Using the top-right drop-down buttons, select your dataset, run-id and ground-truth.
Expand All @@ -83,3 +84,14 @@ After adjusting the slider, click the button "Compute Stats", which is the fourt
## Recreating the experiments of the manuscript
The experiments of [[Preprint] State-of-the-art structural variant calling: What went conceptually wrong and how to fix it?](https://biorxiv.org/cgi/content/short/2021.01.12.426317v1 "bioRxiv preprint")
can be recreated using the [MSV-EVAL repository](https://github.com/ITBE-Lab/MSV-EVAL "MSV-EVAL").

## Changelog

Version 2.0.2:

* 2nd revision for MSV
* The confidence scores for variant calls now make use of coverage information. Before, the conficance of a call was the number of reads supporting the call. Now it is the number of reads supporting over the the coverage in the region of the call. This helps with making calls in real world data, where coverage is uneven.

Version 2.0.1:

* 1st Revision for MSV - only minor changes.
14 changes: 13 additions & 1 deletion libs/ma/src/container/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ void DLL_PORT( MA ) Alignment::removeDangeling( )
uiLength -= data.back( ).second;
data.pop_back( );
} // if
//iScore = reCalcScore();
DEBUG( if( reCalcScore( ) != iScore ) {
std::cerr << "WARNING set wrong score or removed wrong elements in remove dangeling" << std::endl;
for( auto tup : vCopyOfData )
Expand All @@ -297,7 +298,11 @@ void DLL_PORT( MA ) Alignment::removeDangeling( )
int64_t Alignment::reCalcScore( ) const
{
int64_t iScore = 0;
//int64_t iSVScore = -pGlobalParams->uiSVPenalty->get( );
for( unsigned int index = 0; index < data.size( ); index++ )
{
//if(iScore < iSVScore)
// iScore = iSVScore;
switch( std::get<0>( data[ index ] ) )
{
case MatchType::deletion:
Expand All @@ -316,6 +321,9 @@ int64_t Alignment::reCalcScore( ) const
iScore += pGlobalParams->iMatch->get( ) * data[ index ].second;
break;
} // switch
//if(iScore - pGlobalParams->uiSVPenalty->get( ) > iSVScore)
// iSVScore = iScore - pGlobalParams->uiSVPenalty->get( );
}
return iScore;
} // function

Expand Down Expand Up @@ -380,8 +388,12 @@ void exportAlignment( libMS::SubmoduleOrganizer& xOrganizer )
std::shared_ptr<libMS::ContainerVector<std::shared_ptr<Alignment>>>>(
xOrganizer.container(), "AlignmentVector", "docstr" );

xOrganizer.container().def("toAlignmentVector", [](std::shared_ptr<libMS::Container> pPtr){
return std::dynamic_pointer_cast<libMS::ContainerVector<std::shared_ptr<Alignment>>>(pPtr);
});

// tell boost python that pointers of these classes can be converted implicitly
py::implicitly_convertible<libMS::ContainerVector<std::shared_ptr<Alignment>>, libMS::Container>( );
py::implicitly_convertible<libMS::Container, libMS::ContainerVector<std::shared_ptr<Alignment>>>( );

} // function
#endif
9 changes: 7 additions & 2 deletions libs/ma/src/container/soc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
using namespace libMA;

#ifdef WITH_PYTHON
#include <pybind11/stl.h>

void exportSoC( libMS::SubmoduleOrganizer& xOrganizer )
{
#if DEBUG_LEVEL >= 1
py::bind_vector<std::vector<std::pair<nucSeqIndex, nucSeqIndex>>>( xOrganizer._util(), "nucSeqPairVector", "docstr" );
//py::bind_vector<std::vector<std::pair<nucSeqIndex, nucSeqIndex>>>( xOrganizer._util(), "nucSeqPairVector", "docstr" )
// .def("length", &std::vector<std::pair<nucSeqIndex, nucSeqIndex>>::size)
// .def("at", [](std::vector<std::pair<nucSeqIndex, nucSeqIndex>>& rSelf, size_t uiI){return rSelf[uiI];})
// ;

py::class_<SoCPriorityQueue::blub>( xOrganizer._util(), "nucSeqNucSeqInterval" )
.def_readwrite( "first", &SoCPriorityQueue::blub::first )
Expand All @@ -32,7 +36,8 @@ void exportSoC( libMS::SubmoduleOrganizer& xOrganizer )
.def( "pop", &SoCPriorityQueue::pop )
.def( "make_heap", &SoCPriorityQueue::make_heap )
.def( "__len__", &SoCPriorityQueue::size )
DEBUG(.def_readwrite( "scores", &SoCPriorityQueue::vScores )
DEBUG(
.def_readwrite( "scores", &SoCPriorityQueue::vScores )
.def_readwrite( "extract", &SoCPriorityQueue::vExtractOrder )
.def_readwrite( "vSoCs", &SoCPriorityQueue::vSoCs )
.def_readwrite( "vHarmSoCs", &SoCPriorityQueue::vHarmSoCs )
Expand Down
29 changes: 24 additions & 5 deletions libs/ma/src/module/harmonization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,14 @@ std::shared_ptr<Seeds> Harmonization::applyFilters( std::shared_ptr<Seeds>& pIn
double dDeltaDistDiff =
std::abs( iDeltaDistToPre - iDeltaDistToPost ) * 2 / ( (double)iDeltaDistToPre + iDeltaDistToPost );

if( dDeltaDistDiff < dMaxDeltaDist && (uint64_t)iDeltaDistToPre > uiMinDeltaDist )
int64_t iScoreSeed = rCenter.size() * pGlobalParams->iMatch->get( );
int64_t iIndelPenalty = std::min((int64_t)pGlobalParams->uiSVPenalty->get( ) * 2,
(int64_t)(pGlobalParams->iGap->get( ) * 2 +
pGlobalParams->iExtend->get( ) * (iDeltaDistToPre + iDeltaDistToPost))
);

if( dDeltaDistDiff < dMaxDeltaDist && (uint64_t)iDeltaDistToPre > uiMinDeltaDist &&
iScoreSeed < iIndelPenalty * 2 )
{
// the filter triggered ->
// we flag the seed to be ignored by setting it's size to zero
Expand Down Expand Up @@ -255,6 +262,8 @@ std::shared_ptr<Seeds> Harmonization::harmonizeOne( std::shared_ptr<Seeds>& pSee
pSeeds->xStats = pSeedsIn->xStats;
if( pSeedsIn->size( ) > 1 )
{

DEBUG( pSoCIn->vSoCs.push_back( pSeedsIn ); ) // DEBUG
#if USE_RANSAC == 1 // switch between ransac line angle + intercept estimation & 45deg median line
std::vector<double> vX, vY;
vX.reserve( pSeedsIn->size( ) );
Expand All @@ -281,7 +290,8 @@ std::shared_ptr<Seeds> Harmonization::harmonizeOne( std::shared_ptr<Seeds>& pSee
pSeedsIn->erase( std::remove_if( pSeedsIn->begin( ), pSeedsIn->end( ),
[ & ]( const Seed& rS ) {
return deltaDistance( rS, xSlopeIntercept.first,
(int64_t)xSlopeIntercept.second ) > fMAD;
(int64_t)xSlopeIntercept.second ) > fMAD &&
rS.size() < 50;
} ),
pSeedsIn->end( ) );

Expand All @@ -301,8 +311,8 @@ std::shared_ptr<Seeds> Harmonization::harmonizeOne( std::shared_ptr<Seeds>& pSee
for( Seeds::iterator pSeed = pSeedsIn->begin( ); pSeed != pSeedsIn->end( ); pSeed++ )
{
pShadows->push_back( std::make_tuple( pSeed, pSeed->start( ), pSeed->end_ref( ) ) );
// std::cout << "(" << pSeed->start() << "," << pSeed->start_ref() << "," <<
// pSeed->size() << ")," << std::endl;
//std::cout << "1 (" << pSeed->start() << "," << pSeed->start_ref() << "," <<
//pSeed->size() << ")" << std::endl;
} // for

// perform the line sweep algorithm on the left shadows
Expand All @@ -312,16 +322,24 @@ std::shared_ptr<Seeds> Harmonization::harmonizeOne( std::shared_ptr<Seeds>& pSee

// get the right shadows
for( auto& xT : *pShadows2 )
{
pShadows->push_back(
std::make_tuple( std::get<0>( xT ), std::get<0>( xT )->start_ref( ), std::get<0>( xT )->end( ) ) );
//std::cout << "2 (" << std::get<0>( xT )->start_ref() << "," << std::get<0>( xT )->end() << "," <<
//std::get<0>( xT )->size() << ")" << std::endl;
}

// perform the line sweep algorithm on the right shadows
pShadows = linesweep( pShadows, (int64_t)xSlopeIntercept.second, xSlopeIntercept.first );

pSeeds->reserve( pShadows->size( ) );

for( auto& xT : *pShadows )
{
pSeeds->push_back( *std::get<0>( xT ) );
// std::cout << "3 (" << std::get<0>( xT )->start() << "," << std::get<0>( xT )->start_ref() << "," <<
// std::get<0>( xT )->size() << ")" << std::endl;
}

pSeeds->bConsistent = true;

Expand Down Expand Up @@ -367,7 +385,8 @@ std::shared_ptr<Seeds> Harmonization::harmonizeOne( std::shared_ptr<Seeds>& pSee
pSoCIn->vIngroup.push_back( std::make_shared<Seeds>( ) );
pSoCIn->vSlopes.push_back( 0 );
pSoCIn->vIntercepts.push_back( 0 );
pSoCIn->vHarmSoCs.push_back( std::make_shared<Seeds>( pSeeds ) ); ) // DEBUG
pSoCIn->vHarmSoCs.push_back( std::make_shared<Seeds>( pSeeds ) );
pSoCIn->vSoCs.push_back( std::make_shared<Seeds>( pSeeds ) ); ) // DEBUG
} // else
return pSeeds;
}
Expand Down
7 changes: 7 additions & 0 deletions libs/ms/inc/ms/util/parameter.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,7 @@ class Presetting : public ParameterSetBase
AlignerParameterPointer<int> xMaxFuzzinessFilter; // maximal fuzziness for sv calls
AlignerParameterPointer<int> xMaxSuppNtShortCallFilter; // maximal number of nt for short call low support filter
AlignerParameterPointer<int> xMaxCallSizeShortCallFilter; // maximal call size for short call low support filter
AlignerParameterPointer<int> xMaxRegionSizeForAmbiguityFilter; // max size of the
AlignerParameterPointer<int> xMaxRefAmbiguityJump; // Max Ref Ambiguity Jump
AlignerParameterPointer<int> xMMFilterMaxOcc; // xMMFilterMaxOcc
AlignerParameterPointer<int> xMinNtInSoc; // Min NT in SoC
Expand Down Expand Up @@ -802,6 +803,8 @@ class Presetting : public ParameterSetBase
xMaxSuppNtShortCallFilter( this, "Max Supp Nt", "- unused -", SV_PARAMETERS, 10, checkPositiveValue ),
xMaxCallSizeShortCallFilter( this, "Max Call Size Filter", "- unused -", SV_PARAMETERS, 20,
checkPositiveValue ),
xMaxRegionSizeForAmbiguityFilter( this, "Max Regionsize Ambiguity Filter", "", SV_PARAMETERS, 200,
checkPositiveValue ),
xMaxRefAmbiguityJump( this, "Max Ref Ambiguity Jump", "- unused -", SV_PARAMETERS, 10, checkPositiveValue ),
xMMFilterMaxOcc(
this, "Max Occ MM Filter",
Expand Down Expand Up @@ -1028,6 +1031,7 @@ class GlobalParameter : public ParameterSetBase
AlignerParameterPointer<int> iExtend;
AlignerParameterPointer<int> iGap2;
AlignerParameterPointer<int> iExtend2;
AlignerParameterPointer<int> xCoverageBinSize; // Coveragebin size
AlignerParameterPointer<int> uiSVPenalty;

/* Constructor */
Expand All @@ -1052,6 +1056,9 @@ class GlobalParameter : public ParameterSetBase
DP_PARAMETERS, 24, checkPositiveValue ),
iExtend2( this, "Second Extend Penalty", "Second penalty for gap extension. (Two piece affine gap costs)",
DP_PARAMETERS, 1, checkPositiveValue ),
xCoverageBinSize( this, "Coveragebin size",
"Binsize for storing the seed coverage on the genome",
SV_PARAMETERS, 100, checkPositiveValue ),
uiSVPenalty( this, "Pick Local Seed Set C - Maximal Gap Penalty",
"Maximal gap cost penalty during local seed set computation.", HEURISTIC_PARAMETERS, 100,
checkPositiveValue )
Expand Down
69 changes: 69 additions & 0 deletions libs/msv/inc/msv/container/sv_db/query_objects/coverageUpdater.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* @file coverageUpdater.h
* @brief
* @author Markus Schmidt
*/
#pragma once

#include "ma/container/seed.h"
#include "ms/module/get_inserter_container_module.h"
#include "msv/container/sv_db/tables/coverage.h"
#include "util/geom.h"

using namespace libMA;
using namespace libMS;

namespace libMSV
{
/**
* @brief Insertion of coverage into the database.
*/
template <typename DBCon>
class CoverageUpdaterContainer
: public InserterContainer<DBCon, AbstractInserterContainer, CoverageTable, Seeds>
{
public:
using ParentType =
InserterContainer<DBCon, libMS::AbstractInserterContainer, CoverageTable, Seeds>;
const size_t uiBinSize;


CoverageUpdaterContainer( std::shared_ptr<PoolContainer<DBCon>> pPool, int64_t iId,
std::shared_ptr<SharedInserterProfiler> pSharedProfiler )
: ParentType::InserterContainer( pPool, iId, pSharedProfiler ),
uiBinSize(pGlobalParams->xCoverageBinSize->get())
{} // constructur

protected:
virtual size_t insert_override( std::shared_ptr<Seeds> pSeeds )
{
std::set<int64_t> xCoveredBinIds;
for( Seed& rSeed : *pSeeds )
{
const nucSeqIndex uiStartBinId = rSeed.start_ref_cons_rev() / uiBinSize;
const nucSeqIndex uiEndBinId = 1 + (rSeed.end_ref_cons_rev() - 1) / uiBinSize;

for(nucSeqIndex uiX = uiStartBinId; uiX < uiEndBinId; uiX++)
xCoveredBinIds.insert(uiX);
} // for

for(int64_t uiBinId : xCoveredBinIds)
ParentType::pInserter->incCoverage( ParentType::iId /* <- id of the sv jump run */, uiBinId, (uint32_t)1 );
return xCoveredBinIds.size( );
} // method
}; // class

template <typename DBCon, typename DBConInit>
using GetCoverageUpdaterContainerModule =
GetInserterContainerModule<CoverageUpdaterContainer, DBCon, DBConInit, CoverageTable>;


template <typename DBCon> using CoverageUpdaterModule = InserterModule<CoverageUpdaterContainer<DBCon>>;


} // namespace libMSV

#ifdef WITH_PYTHON
/// @brief used to expose libMSV::CoverageUpdaterContainer to python
void exportCoverageUpdater( libMS::SubmoduleOrganizer& xOrganizer );
#endif
Loading

0 comments on commit a09a1f5

Please sign in to comment.