Skip to content
This repository has been archived by the owner on Jul 17, 2023. It is now read-only.

Commit

Permalink
MANTA-267 Support for contig names with colons.
Browse files Browse the repository at this point in the history
This change is motivated by HLA contigs in the 1kg flavor of
hg38. To support this change all region parsing has been
unified to the htsapi library in manta (and Manta's python region
parser). We aren't using the htslib parser anymore.

The new behavior is to split on the last colon. If a colon is found,
then evaluate the second field as a begin-end position pair. If the
string can't be parsed this way then assume the region following the
last colon is also part of the contig name.

Note this means that the samtools "contig:begin" format without listing
an end position is no longer supported.
  • Loading branch information
ctsa committed Sep 27, 2015
1 parent b49d786 commit decec17
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 85 deletions.
1 change: 1 addition & 0 deletions ChangeLog.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-v0.29.0
- MANTA-267 Support contig names with colons (for HLA contigs in 1kg hg38)
- MANTA-252 Complete support for CRAM input
- MANTA-264 Remove samtools from manta dependencies
- MANTA-252 Change default chrom depth to median estimate from alignments
Expand Down
3 changes: 1 addition & 2 deletions src/c++/lib/applications/DumpSVLoci/DumpSVLoci.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ static
void
runDSL(const DSLOptions& opt)
{

SVLocusSet set;
set.load(opt.graphFilename.c_str());

Expand All @@ -57,7 +56,7 @@ runDSL(const DSLOptions& opt)
if (! opt.region.empty())
{
int32_t tid,beginPos,endPos;
parse_bam_region(set.header, opt.region, tid, beginPos, endPos); // parse the region
parse_bam_region(set.header, opt.region.c_str(), tid, beginPos, endPos);

set.dumpRegion(os,GenomeInterval(tid,beginPos,endPos));
}
Expand Down
2 changes: 1 addition & 1 deletion src/c++/lib/applications/EstimateSVLoci/EstimateSVLoci.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ runESLRegion(
const bam_header_info bamHeader(header);

int32_t tid(0), beginPos(0), endPos(0);
parse_bam_region(bamHeader,region,tid,beginPos,endPos);
parse_bam_region(bamHeader,region.c_str(),tid,beginPos,endPos);

const GenomeInterval scanRegion(tid,beginPos,endPos);
#ifdef DEBUG_ESL
Expand Down
133 changes: 101 additions & 32 deletions src/c++/lib/htsapi/bam_header_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,68 +30,137 @@
#include <cassert>

#include <algorithm>
#include <limits>
#include <sstream>



void
parse_bam_region(
const bam_header_info& header,
const std::string& region,
int32_t& tid,
const char* region,
std::string& chrom,
int32_t& begin_pos,
int32_t& end_pos)
{
static const char region_sep1(':');
std::vector<std::string> words;
split_string(region,region_sep1,words);

if (words.empty() || words[0].empty() || (words.size() > 2))
// make first split:
const char* afterChrom;
{
std::ostringstream oss;
oss << "ERROR: can't parse bam_region [err 1] " << region << "\n";
throw blt_exception(oss.str().c_str());
static const char region_sep1(':');
afterChrom = strrchr(region,region_sep1);
if (nullptr != afterChrom)
{
chrom=std::string(region,afterChrom);
assert((*afterChrom) != '\0');
afterChrom++;
}
}

bool isFound(false);
const unsigned n_chroms(header.chrom_data.size());
for (unsigned i(0); i<n_chroms; ++i)

bool isWholeChrom(nullptr == afterChrom);

if (! isWholeChrom)
{
if (words[0]==header.chrom_data[i].label)
// make second split
static const char region_sep2('-');
std::vector<std::string> words2;
split_string(afterChrom,region_sep2,words2);

if (words2.empty() || (words2.size() > 2))
{
std::ostringstream oss;
oss << "ERROR: can't parse begin and end positions from bam_region '" << region << "'\n";
throw blt_exception(oss.str().c_str());
}

if (words2.size() == 2)
{
tid=i;
isFound=true;
break;
begin_pos = (illumina::blt_util::parse_int_str(words2[0]))-1;
end_pos = (illumina::blt_util::parse_int_str(words2[1]));
}
else
{
// this exception allows for chrom names with colons (HLA...) but no positions included
isWholeChrom=true;
}
}

if (! isFound)
if (isWholeChrom)
{
chrom=region;
begin_pos = 0;
end_pos = std::numeric_limits<int32_t>::max();

}

if (chrom.empty())
{
std::ostringstream oss;
oss << "ERROR: can't parse bam_region [err 2] " << region << "\n"
<< "\tchromosome: '" << words[0] << "' not found in header\n";
oss << "ERROR: can't parse contig name from bam_region '" << region << "'\n";
throw blt_exception(oss.str().c_str());
}

begin_pos = 0;
end_pos = header.chrom_data[tid].length;
if (1 == words.size()) return;
if ((begin_pos<0) || (end_pos<0) || (end_pos<=begin_pos))
{
std::ostringstream oss;
oss << "ERROR: nonsensical begin (" << begin_pos << ") and end (" << end_pos << ") positions parsed from bam_region '" << region << "'\n";
throw blt_exception(oss.str().c_str());
}
}


static const char region_sep2('-');
std::vector<std::string> words2;
split_string(words[1],region_sep2,words2);

if (words2.empty() || (words2.size() > 2))
void
parse_bam_region_from_hdr(
const bam_hdr_t* header,
const char* region,
int32_t& tid,
int32_t& begin_pos,
int32_t& end_pos)
{
assert(nullptr != header);
assert(nullptr != region);

std::string chrom;
parse_bam_region(region,chrom,begin_pos,end_pos);

tid = bam_name2id(const_cast<bam_hdr_t*>(header),chrom.c_str());

if (tid < 0)
{
std::ostringstream oss;
oss << "ERROR: can't parse bam_region [err 3] " << region << "\n";
oss << "ERROR: contig '" << chrom << "' from bam_region '" << region << "' not found in BAM/CRAM header\n";
throw blt_exception(oss.str().c_str());
}

begin_pos = (illumina::blt_util::parse_int_str(words2[0]))-1;
end_pos = std::min(end_pos,static_cast<int32_t>(header->target_len[tid]));
}



void
parse_bam_region(
const bam_header_info& header,
const char* region,
int32_t& tid,
int32_t& begin_pos,
int32_t& end_pos)
{
assert(nullptr != region);

std::string chrom;
parse_bam_region(region,chrom,begin_pos,end_pos);

const auto citer(header.chrom_to_index.find(chrom));

if (citer == header.chrom_to_index.end())
{
std::ostringstream oss;
oss << "ERROR: contig '" << chrom << "' from bam_region '" << region << "' not found in BAM/CRAM header\n";
throw blt_exception(oss.str().c_str());
}

if (1 == words2.size()) return;
end_pos = (illumina::blt_util::parse_int_str(words2[1]));
tid = citer->second;
end_pos = std::min(end_pos,static_cast<int32_t>(header.chrom_data[tid].length));
}


Expand Down
24 changes: 23 additions & 1 deletion src/c++/lib/htsapi/bam_header_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,32 @@
#include <string>


/// parse a bam region into chrom/begin/end values
///
void
parse_bam_region(
const char* region,
std::string& chrom,
int32_t& begin_pos,
int32_t& end_pos);


/// parse a bam region into chrom-index/begin/end values based
/// on chromosome index lookup and end positions in bam header
///
void
parse_bam_region_from_hdr(
const bam_hdr_t* header,
const char* region,
int32_t& tid,
int32_t& begin_pos,
int32_t& end_pos);


void
parse_bam_region(
const bam_header_info& header,
const std::string& region,
const char* region,
int32_t& tid,
int32_t& begin_pos,
int32_t& end_pos);
Expand Down
14 changes: 7 additions & 7 deletions src/c++/lib/htsapi/bam_streamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "blt_util/blt_exception.hh"
#include "blt_util/log.hh"
#include "htsapi/bam_header_util.hh"
#include "htsapi/bam_streamer.hh"

#include <cassert>
Expand Down Expand Up @@ -78,11 +79,10 @@ bam_streamer(

if (_hdr->n_targets)
{
// parse a fake region so that header->hash is created
std::string fake_region(target_id_to_name(0));
fake_region += ":1-1";
int ref,beg,end;
bam_parse_region2(_hdr, fake_region.c_str(), ref, beg, end);
// parse any contig name so that header->hash is created
// ignore returned tid value, so doesn't matter if fake name
// exists
target_name_to_id("fake_name");
}
return;
}
Expand Down Expand Up @@ -175,8 +175,8 @@ void
bam_streamer::
set_new_region(const char* region)
{
int ref,beg,end;
bam_parse_region2(_hdr, region, ref, beg, end); // parse the region
int32_t ref,beg,end;
parse_bam_region_from_hdr(_hdr, region, ref, beg, end);

try
{
Expand Down
27 changes: 0 additions & 27 deletions src/c++/lib/htsapi/bam_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,30 +175,3 @@ bam_aux_append_unsigned(bam1_t& br,
bam_aux_append(&br,tag,'C',1,&z);
}
}


void
bam_parse_region2(
const bam_hdr_t* header,
const char* str,
int& ref_id, int& beg, int& end)
{
const char* name_lim = hts_parse_reg(str, &beg, &end);
if (name_lim)
{
const std::string name(str,name_lim);
ref_id = bam_name2id(const_cast<bam_hdr_t*>(header), name.c_str());
}
else
{
// not parsable as a region, but possibly a sequence named "foo:a"
ref_id = bam_name2id(const_cast<bam_hdr_t*>(header), str);
beg = 0;
end = std::numeric_limits<int>::max();
}
if ((ref_id == -1) || (beg > end))
{
log_os << "ERROR: failed to parse samtools region string: '" << str << "'\n";
exit(EXIT_FAILURE);
}
}
7 changes: 0 additions & 7 deletions src/c++/lib/htsapi/bam_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,6 @@ static inline uint32_t bam_calend(const bam1_core_t* c, const uint32_t* cigar)
}


void
bam_parse_region2(
const bam_hdr_t* header,
const char* str,
int& ref_id, int& beg, int& end);


namespace BAM_FLAG
{
enum index_t
Expand Down
21 changes: 13 additions & 8 deletions src/python/lib/workflowUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ def parseGenomeRegion(regionStr) :

assert(regionStr is not None)

word=regionStr.strip().split(':')
word=regionStr.strip().rsplit(':',1)

if (len(word) < 1) or (len(word) > 2) :
if len(word) < 1 :
raise Exception("Unexpected format in genome region string: %s" % (regionStr))

chrom=word[0]
Expand All @@ -106,14 +106,19 @@ def parseGenomeRegion(regionStr) :
end=None

if (len(word) > 1) :
rangeWord=word[1].split('-')
if len(rangeWord) != 2 :
if len(word[1]) == 0 :
raise Exception("Unexpected format in genome region string: %s" % (regionStr))
start = int(rangeWord[0])
end = int(rangeWord[1])

if (end < start) or (start < 1) or (end < 1) :
raise Exception("Unexpected format in genome region string: %s" % (regionStr))
rangeWord=word[1].split('-')
if len(rangeWord) != 2 :
# assume this might be an HLA chrom at this point:
chrom=regionStr.strip()
else :
start = int(rangeWord[0])
end = int(rangeWord[1])

if (end < start) or (start < 1) or (end < 1) :
raise Exception("Unexpected format in genome region string: %s" % (regionStr))

return {"chrom":chrom, "start":start, "end":end}

Expand Down

0 comments on commit decec17

Please sign in to comment.