From e2cdf7cce7cedfe81f77fe1377f4ae23fe7e5346 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 21 Jan 2025 15:24:02 -0800 Subject: [PATCH] Get haplotype results for haplotypes that stop in the middle while others go on --- src/haplotype_extracter.cpp | 23 +++++++++++++++++------ src/haplotype_extracter.hpp | 8 ++------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/haplotype_extracter.cpp b/src/haplotype_extracter.cpp index 19bebfb996..c09a083972 100644 --- a/src/haplotype_extracter.cpp +++ b/src/haplotype_extracter.cpp @@ -232,16 +232,23 @@ vector, gbwt::SearchState> > list_haplotypes(const } }); + size_t continued_members = 0; + for (auto& nhs : next_handle_states) { const handle_t& next = get<0>(nhs); gbwt::node_type& extend_node = get<1>(nhs); gbwt::SearchState& new_state = get<2>(nhs); + + // Keep track of how many selected threads still exist in all the extensions. + continued_members += new_state.size(); vector new_thread; - if (&nhs == &next_handle_states.back()) { - // avoid a copy by re-using the vector for the last thread. this way simple cases - // like scanning along one path don't blow up to n^2 + if (&nhs == &next_handle_states.back() && continued_members == last.second.size()) { + // Avoid a copy by re-using the vector for the last thread if + // we don't need to split off or stop any more haplotypes. This + // way simple cases like scanning along one path don't blow up + // to n^2. new_thread = std::move(last.first); } else { new_thread = last.first; @@ -262,11 +269,15 @@ vector, gbwt::SearchState> > list_haplotypes(const } } - if (next_handle_states.empty()) { + if (continued_members != last.second.size()) { #ifdef debug - cerr << "Got no results for any extension of " << last.second << "; emitting" << endl; + cerr << "Stopped " (last.second.size() - continued_members) << " results here; emitting" << endl; #endif - search_results.emplace_back(std::move(last)); + // We need to make a result with *only* the stopping results. + // So we extend with a sentinel (0, false) GBWT node, which we don't put in our output vector for the haplotype. + // TODO: This is undocumented in gbwt but seems to work. + auto new_state = gbwt.extend(last.second, gbwt::Node::encode(0, false)); + search_results.emplace_back(std::move(last.first), new_state); } } diff --git a/src/haplotype_extracter.hpp b/src/haplotype_extracter.hpp index 5da7f511bd..ffd105ddd5 100644 --- a/src/haplotype_extracter.hpp +++ b/src/haplotype_extracter.hpp @@ -26,9 +26,7 @@ using thread_t = vector; // from the GBWT index. // out_graph may already contain nodes and paths. // If stop_fn returns True, the haplotype extraction is stopped for that haplotype. -// Haplotypes are emitted when they are stopped or when they cannot be extended any more. -// -// TODO: Should results be produced when some haplotypes end but others keep going? +// Haplotypes are emitted when they are stopped or when a sample cannot be continued anymore. void trace_haplotypes(const PathHandleGraph& source, const gbwt::GBWT& haplotype_database, const handle_t& start_handle, function&)> stop_fn, @@ -49,11 +47,9 @@ Path path_from_thread_t(thread_t& t, const HandleGraph& source); // Lists all the sub-haplotypes of nodes starting at // start from the set of haplotypes embedded in the given GBWT // haplotype database. At each step stop_fn() is called on the thread being created, and if it returns true -// then the search stops. The search will also stop and emit a result if it runs out of places to go. +// then the search stops. The search will also emit a result if any selected sample stops. // // No empty sub-haplotypes will be returned. -// -// TODO: Should results be produced when some haplotypes end but others keep going? vector, gbwt::SearchState> > list_haplotypes(const HandleGraph& graph, const gbwt::GBWT& gbwt, handle_t start,