Skip to content

Commit

Permalink
Get haplotype results for haplotypes that stop in the middle while ot…
Browse files Browse the repository at this point in the history
…hers go on
  • Loading branch information
adamnovak committed Jan 21, 2025
1 parent b19e7fc commit e2cdf7c
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
23 changes: 17 additions & 6 deletions src/haplotype_extracter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,16 +232,23 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
}
});

size_t continued_members = 0;

for (auto& nhs : next_handle_states) {

const handle_t& next = get<0>(nhs);
gbwt::node_type& extend_node = get<1>(nhs);
gbwt::SearchState& new_state = get<2>(nhs);

// Keep track of how many selected threads still exist in all the extensions.
continued_members += new_state.size();

vector<gbwt::node_type> new_thread;
if (&nhs == &next_handle_states.back()) {
// avoid a copy by re-using the vector for the last thread. this way simple cases
// like scanning along one path don't blow up to n^2
if (&nhs == &next_handle_states.back() && continued_members == last.second.size()) {
// Avoid a copy by re-using the vector for the last thread if
// we don't need to split off or stop any more haplotypes. This
// way simple cases like scanning along one path don't blow up
// to n^2.
new_thread = std::move(last.first);
} else {
new_thread = last.first;
Expand All @@ -262,11 +269,15 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
}
}

if (next_handle_states.empty()) {
if (continued_members != last.second.size()) {
#ifdef debug
cerr << "Got no results for any extension of " << last.second << "; emitting" << endl;
cerr << "Stopped " (last.second.size() - continued_members) << " results here; emitting" << endl;
#endif
search_results.emplace_back(std::move(last));
// We need to make a result with *only* the stopping results.
// So we extend with a sentinel (0, false) GBWT node, which we don't put in our output vector for the haplotype.
// TODO: This is undocumented in gbwt but seems to work.
auto new_state = gbwt.extend(last.second, gbwt::Node::encode(0, false));
search_results.emplace_back(std::move(last.first), new_state);
}

}
Expand Down
8 changes: 2 additions & 6 deletions src/haplotype_extracter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ using thread_t = vector<gbwt::node_type>;
// from the GBWT index.
// out_graph may already contain nodes and paths.
// If stop_fn returns True, the haplotype extraction is stopped for that haplotype.
// Haplotypes are emitted when they are stopped or when they cannot be extended any more.
//
// TODO: Should results be produced when some haplotypes end but others keep going?
// Haplotypes are emitted when they are stopped or when a sample cannot be continued anymore.
void trace_haplotypes(const PathHandleGraph& source,
const gbwt::GBWT& haplotype_database,
const handle_t& start_handle, function<bool(const vector<gbwt::node_type>&)> stop_fn,
Expand All @@ -49,11 +47,9 @@ Path path_from_thread_t(thread_t& t, const HandleGraph& source);
// Lists all the sub-haplotypes of nodes starting at
// start from the set of haplotypes embedded in the given GBWT
// haplotype database. At each step stop_fn() is called on the thread being created, and if it returns true
// then the search stops. The search will also stop and emit a result if it runs out of places to go.
// then the search stops. The search will also emit a result if any selected sample stops.
//
// No empty sub-haplotypes will be returned.
//
// TODO: Should results be produced when some haplotypes end but others keep going?
vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const HandleGraph& graph,
const gbwt::GBWT& gbwt,
handle_t start,
Expand Down

0 comments on commit e2cdf7c

Please sign in to comment.