Skip to content

Commit

Permalink
Merge pull request #4298 from vgteam/deconstruct
Browse files Browse the repository at this point in the history
Refactor deconstruct to use less protobuf
  • Loading branch information
glennhickey authored May 22, 2024
2 parents 8ece064 + f29b05f commit f54e633
Show file tree
Hide file tree
Showing 15 changed files with 540 additions and 504 deletions.
8 changes: 0 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -396,14 +396,6 @@ vg index hla.vg -x hla.xg
vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf
```

Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization:

<!-- !test check Deconstruct from construct -->
```sh
# using the same graph from the `map` example
vg deconstruct x.xg -p x > x.vcf
```

Haplotype paths from `.gbz` or `.gbwt` indexes input can be considered using `-z` and `-g', respectively.

As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs.
Expand Down
2 changes: 1 addition & 1 deletion src/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector<R
}
path_name_set.clear();
graph_path_name_set.clear();
PathTraversalFinder trav_finder(*graph, snarl_manager, path_names);
PathTraversalFinder trav_finder(*graph, path_names);

// Do the top-level snarls, the recurse as needed (framework copied from deconstructor.cpp)
snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) {
Expand Down
670 changes: 303 additions & 367 deletions src/deconstructor.cpp

Large diffs are not rendered by default.

61 changes: 26 additions & 35 deletions src/deconstructor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,7 @@ class Deconstructor : public VCFOutputCaller {

// deconstruct the entire graph to cout.
// Not even a little bit thread safe.
void deconstruct(vector<string> refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager,
bool path_restricted_traversals,
int ploidy,
void deconstruct(vector<string> refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager,
bool include_nested,
int context_jaccard_window,
bool untangle_traversals,
Expand All @@ -51,14 +49,34 @@ class Deconstructor : public VCFOutputCaller {

private:

// initialize the vcf and get the header
string get_vcf_header();

// deconstruct all snarls in parallel (ie nesting relationship ignored)
void deconstruct_graph(SnarlManager* snarl_manager);

// deconstruct all top-level snarls in parallel
// nested snarls are processed after their parents in the same thread
// (same logic as vg call)
void deconstruct_graph_top_down(SnarlManager* snarl_manager);

// write a vcf record for the given site. returns true if a record was written
// (need to have a path going through the site)
bool deconstruct_site(const Snarl* site) const;
bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const;

// get the traversals for a given site
// this returns a combination of embedded path traversals and gbwt traversals
// the embedded paths come first, and only they get trav_steps.
// so you can use trav_steps.size() to find the index of the first gbwt traversal...
void get_traversals(const handle_t& snarl_start, const handle_t& snarl_end,
vector<Traversal>& out_travs,
vector<string>& out_trav_path_names,
vector<pair<step_handle_t, step_handle_t>>& out_trav_steps) const;

// convert traversals to strings. returns mapping of traversal (offset in travs) to allele
vector<int> get_alleles(vcflib::Variant& v,
const pair<vector<SnarlTraversal>,
vector<pair<step_handle_t, step_handle_t>>>& path_travs,
const vector<Traversal>& travs,
const vector<pair<step_handle_t, step_handle_t>>& trav_steps,
int ref_path_idx,
const vector<bool>& use_trav,
char prev_char, bool use_start) const;
Expand All @@ -74,19 +92,6 @@ class Deconstructor : public VCFOutputCaller {
const vector<string>& trav_to_name,
const vector<int>& gbwt_phases) const;

// check to see if a snarl is too big to exhaustively traverse
bool check_max_nodes(const Snarl* snarl) const;

// get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively)
// with node visits
vector<SnarlTraversal> explicit_exhaustive_traversals(const Snarl* snarl) const;

// gets a sorted node id context for a given path
vector<nid_t> get_context(
const pair<vector<SnarlTraversal>,
vector<pair<step_handle_t, step_handle_t>>>& path_travs,
const int& trav_idx) const;

// the underlying context-getter
vector<nid_t> get_context(
step_handle_t start_step,
Expand All @@ -101,22 +106,14 @@ class Deconstructor : public VCFOutputCaller {
const dac_vector<>& target,
const vector<nid_t>& query) const;

// toggle between exhaustive and path restricted traversal finder
bool path_restricted = false;

// the max ploidy we expect.
int ploidy;

// the graph
const PathPositionHandleGraph* graph;

// the snarl manager
SnarlManager* snarl_manager;
// the gbwt
gbwt::GBWT* gbwt;

// the traversal finders. we always use a path traversal finder to get the reference path
unique_ptr<PathTraversalFinder> path_trav_finder;
// we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths
unique_ptr<TraversalFinder> trav_finder;
// we can also use a gbwt for traversals
unique_ptr<GBWTTraversalFinder> gbwt_trav_finder;
// When using the gbwt we need some precomputed information to ask about stored paths.
Expand All @@ -143,9 +140,6 @@ class Deconstructor : public VCFOutputCaller {
// the sample ploidys given in the phases in our path names
unordered_map<string, int> sample_ploidys;

// upper limit of degree-2+ nodes for exhaustive traversal
int max_nodes_for_exhaustive = 100;

// target window size for determining the correct reference position for allele traversals with path jaccard
int path_jaccard_window = 10000;

Expand All @@ -160,9 +154,6 @@ class Deconstructor : public VCFOutputCaller {

// should we keep conflicted genotypes or not
bool keep_conflicted_genotypes = false;

// warn about context jaccard not working with exhaustive traversals
mutable atomic<bool> exhaustive_jaccard_warning;
};

// helpel for measuring set intersectiond and union size
Expand Down
23 changes: 23 additions & 0 deletions src/graph_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,17 @@ void VCFOutputCaller::set_nested(bool nested) {
include_nested = nested;
}

void VCFOutputCaller::add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, const Traversal& trav,
bool reversed, bool one_based) const {
SnarlTraversal proto_trav;
for (const handle_t& handle : trav) {
Visit* visit = proto_trav.add_visit();
visit->set_node_id(graph->get_id(handle));
visit->set_backward(graph->get_is_reverse(handle));
}
this->add_allele_path_to_info(v, allele, proto_trav, reversed, one_based);
}

void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav,
bool reversed, bool one_based) const {
auto& trav_info = v.info["AT"];
Expand Down Expand Up @@ -726,6 +737,18 @@ void VCFOutputCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool
}
}

string VCFOutputCaller::print_snarl(const HandleGraph* graph, const handle_t& snarl_start,
const handle_t& snarl_end, bool in_brackets) const {
Snarl snarl;
Visit* start = snarl.mutable_start();
start->set_node_id(graph->get_id(snarl_start));
start->set_backward(graph->get_is_reverse(snarl_start));
Visit* end = snarl.mutable_end();
end->set_node_id(graph->get_id(snarl_end));
end->set_backward(graph->get_is_reverse(snarl_end));
return this->print_snarl(snarl, in_brackets);
}

string VCFOutputCaller::print_snarl(const Snarl& snarl, bool in_brackets) const {
// todo, should we canonicalize here by putting lexicographic lowest node first?
nid_t start_node_id = snarl.start().node_id();
Expand Down
6 changes: 6 additions & 0 deletions src/graph_caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,12 @@ class VCFOutputCaller {
protected:

/// add a traversal to the VCF info field in the format of a GFA W-line or GAF path
void add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele,
const Traversal& trav, bool reversed, bool one_based) const;
/// legacy version of above
void add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const;


/// convert a traversal into an allele string
string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const;

Expand All @@ -131,6 +135,8 @@ class VCFOutputCaller {

/// print a snarl in a consistent form like >3435<12222
/// if in_brackets set to true, do (>3435<12222) instead (this is only used for nested caller)
string print_snarl(const HandleGraph* grpah, const handle_t& snarl_start, const handle_t& snarl_end, bool in_brackets = false) const;
/// legacy version of above
string print_snarl(const Snarl& snarl, bool in_brackets = false) const;

/// do the opposite of above
Expand Down
2 changes: 1 addition & 1 deletion src/mcmc_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ namespace vg {
return false;
}

PathTraversalFinder trav_finder(*path_position_handle_graph, snarl_manager, ref_paths);
PathTraversalFinder trav_finder(*path_position_handle_graph, ref_paths);
auto trav_results = trav_finder.find_path_traversals(snarl);
vector<SnarlTraversal> ref_path = trav_results.first;

Expand Down
25 changes: 5 additions & 20 deletions src/subcommand/deconstruct_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,10 @@ void help_deconstruct(char** argv){
<< " -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed)." << endl
<< " Other non-ref paths not considered as samples. " << endl
<< " -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing." << endl
<< " -g, --gbwt FILE only consider alt traversals that correspond to GBWT threads FILE (not needed for GBZ graph input)." << endl
<< " -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input)." << endl
<< " -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output" << endl
<< " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl
<< " -e, --path-traversals Only consider traversals that correspond to paths in the graph." << endl
<< " -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported)." << endl
<< " -d, --ploidy N Expected ploidy. If more traversals found, they will be flagged as conflicts (default: 2)" << endl
<< " -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000)." << endl
<< " -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field)." << endl
<< " -K, --keep-conflicted Retain conflicted genotypes in output." << endl
Expand All @@ -73,8 +71,6 @@ int main_deconstruct(int argc, char** argv){
bool gbz_translation = false;
bool path_restricted_traversals = false;
bool show_progress = false;
int ploidy = 2;
bool set_ploidy = false;
bool all_snarls = false;
bool keep_conflicted = false;
bool strict_conflicts = false;
Expand All @@ -96,7 +92,6 @@ int main_deconstruct(int argc, char** argv){
{"translation", required_argument, 0, 'T'},
{"gbz-translation", no_argument, 0, 'O'},
{"path-traversals", no_argument, 0, 'e'},
{"ploidy", required_argument, 0, 'd'},
{"context-jaccard", required_argument, 0, 'c'},
{"untangle-travs", no_argument, 0, 'u'},
{"all-snarls", no_argument, 0, 'a'},
Expand Down Expand Up @@ -140,12 +135,12 @@ int main_deconstruct(int argc, char** argv){
gbz_translation = true;
break;
case 'e':
path_restricted_traversals = true;
cerr << "Warning [vg deconstruct]: -e is deprecated as it's now on default" << endl;
break;
case 'd':
ploidy = parse<int>(optarg);
set_ploidy = true;
cerr << "Warning [vg deconstruct]: -d is deprecated - ploidy now inferred from haplotypes in path names" << endl;
break;
break;
case 'c':
context_jaccard_window = parse<int>(optarg);
break;
Expand Down Expand Up @@ -206,16 +201,6 @@ int main_deconstruct(int argc, char** argv){
cerr << "Error [vg deconstruct]: -O can only be used when input graph is in GBZ format" << endl;
}

if (set_ploidy && !path_restricted_traversals && gbwt_file_name.empty() && !gbz_graph) {
cerr << "Error [vg deconstruct]: -d can only be used with -e or -g or GBZ input" << endl;
return 1;
}

if ((!gbwt_file_name.empty() || gbz_graph) && path_restricted_traversals && !gbz_graph) {
cerr << "Error [vg deconstruct]: -e cannot be used with -g or GBZ input" << endl;
return 1;
}

if (!gbwt_file_name.empty() || gbz_graph) {
// context jaccard depends on having steps for each alt traversal, which is
// not something we have on hand when getting traversals from the GBWT/GBZ
Expand Down Expand Up @@ -352,7 +337,7 @@ int main_deconstruct(int argc, char** argv){
}
dd.set_translation(translation.get());
dd.set_nested(all_snarls);
dd.deconstruct(refpaths, graph, snarl_manager.get(), path_restricted_traversals, ploidy,
dd.deconstruct(refpaths, graph, snarl_manager.get(),
all_snarls,
context_jaccard_window,
untangle_traversals,
Expand Down
2 changes: 1 addition & 1 deletion src/subcommand/snarls_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ int main_snarl(int argc, char** argv) {

if (path_traversals) {
// Limit traversals to embedded paths
trav_finder.reset(new PathTraversalFinder(*graph, snarl_manager));
trav_finder.reset(new PathTraversalFinder(*graph));
} else if (vcf_filename.empty()) {
// This finder works with any backing graph
trav_finder.reset(new ExhaustiveTraversalFinder(*graph, snarl_manager));
Expand Down
Loading

1 comment on commit f54e633

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17540 seconds

Please sign in to comment.