Skip to content

Commit

Permalink
add chains stats (-C)
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Dec 8, 2023
1 parent 710f00c commit ea744d7
Showing 1 changed file with 101 additions and 49 deletions.
150 changes: 101 additions & 49 deletions src/subcommand/stats_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ void help_stats(char** argv) {
<< " multiple allowed; limit comparison to those provided" << endl
<< " -O, --overlap-all print overlap table for the cartesian product of paths" << endl
<< " -R, --snarls print statistics for each snarl" << endl
<< " -C, --chains print statistics for each chain" << endl
<< " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " <<
"Can't detect Protobuf if graph read from stdin" << endl
<< " -D, --degree-dist print degree distribution of the graph." << endl
Expand Down Expand Up @@ -102,6 +103,7 @@ int main_stats(int argc, char** argv) {
vector<string> paths_to_overlap;
bool overlap_all_paths = false;
bool snarl_stats = false;
bool chain_stats = false;
bool format = false;
bool degree_dist = false;
string distance_index_filename;
Expand Down Expand Up @@ -132,6 +134,7 @@ int main_stats(int argc, char** argv) {
{"overlap", no_argument, 0, 'o'},
{"overlap-all", no_argument, 0, 'O'},
{"snarls", no_argument, 0, 'R'},
{"chains", no_argument, 0, 'C'},
{"format", no_argument, 0, 'F'},
{"degree-dist", no_argument, 0, 'D'},
{"dist-snarls", required_argument, 0, 'b'},
Expand All @@ -140,7 +143,7 @@ int main_stats(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORFDb:p:",
c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORCFDb:p:",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -229,6 +232,10 @@ int main_stats(int argc, char** argv) {
snarl_stats = true;
break;

case 'C':
chain_stats = true;
break;

case 'v':
verbose = true;
break;
Expand Down Expand Up @@ -1088,49 +1095,53 @@ int main_stats(int argc, char** argv) {


}

SnarlManager manager; // todo: option to read snarls
// We will track depth for each snarl (used for both snarl and chains stats)
unordered_map<const Snarl*, size_t> depth;

if (snarl_stats) {
if (snarl_stats || chain_stats) {
// We will go through all the snarls and compute stats.

require_graph();

// First compute the snarls
auto manager = IntegratedSnarlFinder(*graph).find_snarls_parallel();
manager = IntegratedSnarlFinder(*graph).find_snarls_parallel();

// We will track depth for each snarl
unordered_map<const Snarl*, size_t> depth;

// TSV header
cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n";

manager.for_each_snarl_preorder([&](const Snarl* snarl) {
// Loop over all the snarls and print stats.

// snarl
cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t";
cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t";
if (snarl_stats) {
// snarl
cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t";
cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t";

// Snarl metadata
cout << (snarl->type() == ULTRABUBBLE) << "\t";
cout << (snarl->type() == UNARY) << "\t";

// Snarl size not including boundary nodes
pair<unordered_set<vg::id_t>, unordered_set<vg::edge_t> > contents = manager.shallow_contents(snarl, *graph, false);
size_t num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
contents = manager.deep_contents(snarl, *graph, false);
num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
// Snarl metadata
cout << (snarl->type() == ULTRABUBBLE) << "\t";
cout << (snarl->type() == UNARY) << "\t";

// Snarl size not including boundary nodes
pair<unordered_set<vg::id_t>, unordered_set<vg::edge_t> > contents = manager.shallow_contents(snarl, *graph, false);
size_t num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
contents = manager.deep_contents(snarl, *graph, false);
num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";

// Compute depth
auto parent = manager.parent_of(snarl);
Expand All @@ -1140,35 +1151,76 @@ int main_stats(int argc, char** argv) {
} else {
depth[snarl] = depth[parent] + 1;
}
cout << depth[snarl] << "\t";

if (snarl_stats) {
cout << depth[snarl] << "\t";

// Number of children (looking inside chains)
cout << manager.children_of(snarl).size() << "\t";
// Number of children (looking inside chains)
cout << manager.children_of(snarl).size() << "\t";

// Number of chains (including unary child snarls)
// Will be 0 for leaves
auto chains = manager.chains_of(snarl);
cout << chains.size() << "\t";

for (size_t i = 0; i < chains.size(); ++i) {
// Number of children in each chain
cout << chains[i].size();
if (i < chains.size() - 1) {
cout << ",";
// Number of chains (including unary child snarls)
// Will be 0 for leaves
auto chains = manager.chains_of(snarl);
cout << chains.size() << "\t";

for (size_t i = 0; i < chains.size(); ++i) {
// Number of children in each chain
cout << chains[i].size();
if (i < chains.size() - 1) {
cout << ",";
}
}
if (chains.empty()) {
cout << "0";
}
cout << "\t";

// Net graph info
// Internal connectivity not important, we just want the size.
auto netGraph = manager.net_graph_of(snarl, graph, false);
cout << netGraph.get_node_count() << endl;
}
if (chains.empty()) {
cout << "0";
});

}

if (chain_stats) {
// We will go through all the chains and compute stats.


// TSV header
cout << "Snarl1-Start\tSSnarl1-Start-Reversed\tSnarl1-End\tSnarl1-End-Reversed\tSnarl1-Reversed\tSnarl2-Start\tSSnarl2-Start-Reversed\tSnarl2-End\tSnarl2-End-Reversed\tSnarl2-Reversed\tSnarl-Count\tMax-Depth\tChild-Snarls\tChild-Chains\n";

manager.for_each_chain([&](const Chain* chain) {
// Loop over all the snarls and print stats.

// snarl endpoints
cout << chain->front().first->start().node_id() << "\t" << chain->front().first->start().backward() << "\t"
<< chain->front().first->end().node_id() << "\t" << chain->front().first->end().backward() << "\t"
<< chain->front().second << "\t";
cout << chain->back().first->start().node_id() << "\t" << chain->back().first->start().backward() << "\t"
<< chain->back().first->end().node_id() << "\t" << chain->back().first->end().backward() << "\t"
<< chain->back().second << "\t";

// snarl count
cout << chain->size() << "\t";

int64_t max_depth = 0;
int64_t child_snarls = 0;
int64_t child_chains = 0;
for (const auto& sr : *chain) {
const Snarl* snarl = sr.first;
max_depth = max(max_depth, (int64_t)depth.at(snarl));
child_snarls += manager.children_of(snarl).size();
child_chains += manager.chains_of(snarl).size();
}
cout << "\t";

// Net graph info
// Internal connectivity not important, we just want the size.
auto netGraph = manager.net_graph_of(snarl, graph, false);
cout << netGraph.get_node_count() << endl;
cout << max_depth << "\t" << child_snarls << "\t" << child_chains;

cout << endl;
});

}


if (!distance_index_filename.empty()) {
//Print snarl stats from a distance index
Expand Down

1 comment on commit ea744d7

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch deconstruct. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17578 seconds

Please sign in to comment.