Skip to content

Commit

Permalink
Merge pull request #4471 from vgteam/flexible-exon-numbering
Browse files Browse the repository at this point in the history
Flexible exon numbering
  • Loading branch information
adamnovak authored Dec 9, 2024
2 parents 127e557 + 9711c5a commit d5859e1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 70 deletions.
93 changes: 25 additions & 68 deletions src/transcriptome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,11 +582,8 @@ int32_t Transcriptome::parse_transcripts(vector<Transcript> * transcripts, uint3
string feature;
string pos;
string strand;
string attributes;
string attribute;

bool zero_based_exon_number = false;

while (transcript_stream->good()) {

line_number += 1;
Expand Down Expand Up @@ -650,8 +647,6 @@ int32_t Transcriptome::parse_transcripts(vector<Transcript> * transcripts, uint3
transcript_line_ss.ignore(numeric_limits<streamsize>::max(), '\t');

string transcript_id = "";
int32_t exon_number = -1;

while (getline(transcript_line_ss, attribute, ';')) {

if (attribute.empty()) {
Expand All @@ -665,39 +660,7 @@ int32_t Transcriptome::parse_transcripts(vector<Transcript> * transcripts, uint3
transcript_id = parse_attribute_value(attribute, transcript_tag);
}

// Parse exon number.
if (exon_number < 0) {

auto exon_number_str = parse_attribute_value(attribute, "exon_number");

if (exon_number_str.empty()) {

// If not exon_number attribute try ID.
auto exon_id = parse_attribute_value(attribute, "ID");

if (count(exon_id.begin(), exon_id.end(), ':') == 2) {

auto exon_id_ss = stringstream(exon_id);

string element;
getline(exon_id_ss, element, ':');

if (element == "exon") {

getline(exon_id_ss, element, ':');
getline(exon_id_ss, element);

exon_number = stoi(element);
}
}

} else {

exon_number = stoi(exon_number_str);
}
}

if (!transcript_id.empty() && exon_number >= 0) {
if (!transcript_id.empty()) {

break;
}
Expand Down Expand Up @@ -728,25 +691,18 @@ int32_t Transcriptome::parse_transcripts(vector<Transcript> * transcripts, uint3
// Add exon to current transcript.
add_exon(transcript, make_pair(spos, epos), graph_path_pos_overlay);
}
}

// Check if exons are in correct order in file.
if (exon_number >= 0) {

// If first transcript and exon, set whether exon numbering is zero-based.
if (parsed_transcripts.size() == 1 && transcript->exons.size() == 1) {
for (auto & transcript: parsed_transcripts) {
// Reorder reversed order exons.
reorder_exons(&(transcript.second));

zero_based_exon_number = (exon_number == 0) ? true : false;
}
// Exclude transcripts with exons in incorrect order according to bp.
if (has_incorrect_order_exons(transcript.second.exons)) {

if (transcript->exons.size() - static_cast<uint32_t>(zero_based_exon_number) != exon_number) {

// Exclude transcripts with exons in incorrect order according to attributes.
excluded_transcripts.emplace(transcript_id);
}
excluded_transcripts.emplace(transcript.first);
}
}

for (auto & transcript: parsed_transcripts) {

// Exclude transcripts with overlapping exons.
if (has_overlapping_exons(transcript.second.exons)) {
Expand All @@ -759,13 +715,11 @@ int32_t Transcriptome::parse_transcripts(vector<Transcript> * transcripts, uint3

transcripts->reserve(transcripts->size() + parsed_transcripts.size() - excluded_transcripts.size());

// Populate transcripts with parsed_transcripts not in excluded_transcripts.
for (auto & transcript: parsed_transcripts) {

if (excluded_transcripts.find(transcript.first) == excluded_transcripts.end()) {

// Reorder reversed order exons.
reorder_exons(&(transcript.second));

transcripts->emplace_back(move(transcript.second));
}
}
Expand Down Expand Up @@ -884,7 +838,7 @@ void Transcriptome::reorder_exons(Transcript * transcript) const {

if (transcript->is_reverse) {

// Is exons in reverse order.
// Are exons in reverse order?
bool is_reverse_order = true;
for (size_t i = 1; i < transcript->exons.size(); i++) {

Expand All @@ -905,21 +859,24 @@ void Transcriptome::reorder_exons(Transcript * transcript) const {
bool Transcriptome::has_overlapping_exons(const vector<Exon> & exons) const {

for (size_t i = 1; i < exons.size(); ++i) {
// Assumes that exons are in increasing coordinate order.
if (exons.at(i - 1).coordinates.second >= exons.at(i).coordinates.first) {

return true;
}
}

// Is exons in reverse order.
if (exons.at(i - 1).coordinates.first <= exons.at(i).coordinates.first) {

if (exons.at(i - 1).coordinates.second >= exons.at(i).coordinates.first) {

return true;
}

} else {
return false;
}

if (exons.at(i).coordinates.second >= exons.at(i - 1).coordinates.first) {
bool Transcriptome::has_incorrect_order_exons(const vector<Exon> & exons) const {

return true;
}
for (size_t i = 1; i < exons.size(); ++i) {
// Assumes that exons are in increasing coordinate order.
if (exons.at(i - 1).coordinates.first > exons.at(i).coordinates.first
|| exons.at(i - 1).coordinates.second > exons.at(i).coordinates.second) {

return true;
}
}

Expand Down
5 changes: 5 additions & 0 deletions src/transcriptome.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,12 @@ class Transcriptome {
/// are ordered in reverse.
void reorder_exons(Transcript * transcript) const;

/// Checks whether any adjacent exons are out of order
/// Assumes exons are in increasing order (to be correct)
bool has_incorrect_order_exons(const vector<Exon> & exons) const;

/// Checks whether any adjacent exons overlap.
/// Assumes exons are in increasing order
bool has_overlapping_exons(const vector<Exon> & exons) const;

/// Constructs edited reference transcript paths from a set of
Expand Down
7 changes: 5 additions & 2 deletions src/unittest/transcriptome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,13 +499,16 @@ namespace vg {
SECTION("Transcriptome can parse gff3 file and exclude transcripts with incorrect exon order") {

stringstream transcript_stream2;
transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=2" << endl;
transcript_stream2 << "path1\t.\texon\t9\t10\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl;
transcript_stream2 << "path1\t.\texon\t9\t10\t.\t+\t.\ttranscript_id=transcript1;exon_number=2" << endl;
transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl;
transcript_stream2 << "path1\t.\texon\t19\t21\t.\t+\t.\ttranscript_id=transcript1;exon_number=3" << endl;
transcript_stream2 << "path1\t.\texon\t16\t21\t.\t-\t.\tID=exon:transcript2:0;transcript_id=transcript2;" << endl;
transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl;
transcript_stream2 << "path1\t.\texon\t8\t9\t.\t-\t.\tID=exon:transcript2:2;transcript_id=transcript2" << endl;
transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl;
transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl;
transcript_stream2 << "path1\t.\texon\t8\t10\t.\t+\t.\ttranscript_id=transcript4" << endl;
transcript_stream2 << "path1\t.\texon\t3\t6\t.\t+\t.\ttranscript_id=transcript4" << endl;

transcriptome.add_reference_transcripts(vector<istream *>({&transcript_stream2}), empty_haplotype_index, false, false);

Expand Down

1 comment on commit d5859e1

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17528 seconds

Please sign in to comment.