diff --git a/src/subcommand/stats_main.cpp b/src/subcommand/stats_main.cpp index cfe6754aa3b..b6560af2593 100644 --- a/src/subcommand/stats_main.cpp +++ b/src/subcommand/stats_main.cpp @@ -64,6 +64,7 @@ void help_stats(char** argv) { << " multiple allowed; limit comparison to those provided" << endl << " -O, --overlap-all print overlap table for the cartesian product of paths" << endl << " -R, --snarls print statistics for each snarl" << endl + << " -C, --chains print statistics for each chain" << endl << " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " << "Can't detect Protobuf if graph read from stdin" << endl << " -D, --degree-dist print degree distribution of the graph." << endl @@ -102,6 +103,7 @@ int main_stats(int argc, char** argv) { vector paths_to_overlap; bool overlap_all_paths = false; bool snarl_stats = false; + bool chain_stats = false; bool format = false; bool degree_dist = false; string distance_index_filename; @@ -132,6 +134,7 @@ int main_stats(int argc, char** argv) { {"overlap", no_argument, 0, 'o'}, {"overlap-all", no_argument, 0, 'O'}, {"snarls", no_argument, 0, 'R'}, + {"chains", no_argument, 0, 'C'}, {"format", no_argument, 0, 'F'}, {"degree-dist", no_argument, 0, 'D'}, {"dist-snarls", required_argument, 0, 'b'}, @@ -140,7 +143,7 @@ int main_stats(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORFDb:p:", + c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORCFDb:p:", long_options, &option_index); // Detect the end of the options. @@ -229,6 +232,10 @@ int main_stats(int argc, char** argv) { snarl_stats = true; break; + case 'C': + chain_stats = true; + break; + case 'v': verbose = true; break; @@ -1088,17 +1095,19 @@ int main_stats(int argc, char** argv) { } + + SnarlManager manager; // todo: option to read snarls + // We will track depth for each snarl (used for both snarl and chains stats) + unordered_map depth; - if (snarl_stats) { + if (snarl_stats || chain_stats) { // We will go through all the snarls and compute stats. require_graph(); // First compute the snarls - auto manager = IntegratedSnarlFinder(*graph).find_snarls_parallel(); + manager = IntegratedSnarlFinder(*graph).find_snarls_parallel(); - // We will track depth for each snarl - unordered_map depth; // TSV header cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n"; @@ -1106,31 +1115,33 @@ int main_stats(int argc, char** argv) { manager.for_each_snarl_preorder([&](const Snarl* snarl) { // Loop over all the snarls and print stats. - // snarl - cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t"; - cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t"; + if (snarl_stats) { + // snarl + cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t"; + cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t"; - // Snarl metadata - cout << (snarl->type() == ULTRABUBBLE) << "\t"; - cout << (snarl->type() == UNARY) << "\t"; - - // Snarl size not including boundary nodes - pair, unordered_set > contents = manager.shallow_contents(snarl, *graph, false); - size_t num_bases = 0; - for (vg::id_t node_id : contents.first) { - num_bases += graph->get_length(graph->get_handle(node_id)); - } - cout << contents.first.size() << "\t"; - cout << contents.second.size() << "\t"; - cout << num_bases << "\t"; - contents = manager.deep_contents(snarl, *graph, false); - num_bases = 0; - for (vg::id_t node_id : contents.first) { - num_bases += graph->get_length(graph->get_handle(node_id)); + // Snarl metadata + cout << (snarl->type() == ULTRABUBBLE) << "\t"; + cout << (snarl->type() == UNARY) << "\t"; + + // Snarl size not including boundary nodes + pair, unordered_set > contents = manager.shallow_contents(snarl, *graph, false); + size_t num_bases = 0; + for (vg::id_t node_id : contents.first) { + num_bases += graph->get_length(graph->get_handle(node_id)); + } + cout << contents.first.size() << "\t"; + cout << contents.second.size() << "\t"; + cout << num_bases << "\t"; + contents = manager.deep_contents(snarl, *graph, false); + num_bases = 0; + for (vg::id_t node_id : contents.first) { + num_bases += graph->get_length(graph->get_handle(node_id)); + } + cout << contents.first.size() << "\t"; + cout << contents.second.size() << "\t"; + cout << num_bases << "\t"; } - cout << contents.first.size() << "\t"; - cout << contents.second.size() << "\t"; - cout << num_bases << "\t"; // Compute depth auto parent = manager.parent_of(snarl); @@ -1140,35 +1151,76 @@ int main_stats(int argc, char** argv) { } else { depth[snarl] = depth[parent] + 1; } - cout << depth[snarl] << "\t"; + + if (snarl_stats) { + cout << depth[snarl] << "\t"; - // Number of children (looking inside chains) - cout << manager.children_of(snarl).size() << "\t"; + // Number of children (looking inside chains) + cout << manager.children_of(snarl).size() << "\t"; - // Number of chains (including unary child snarls) - // Will be 0 for leaves - auto chains = manager.chains_of(snarl); - cout << chains.size() << "\t"; - - for (size_t i = 0; i < chains.size(); ++i) { - // Number of children in each chain - cout << chains[i].size(); - if (i < chains.size() - 1) { - cout << ","; + // Number of chains (including unary child snarls) + // Will be 0 for leaves + auto chains = manager.chains_of(snarl); + cout << chains.size() << "\t"; + + for (size_t i = 0; i < chains.size(); ++i) { + // Number of children in each chain + cout << chains[i].size(); + if (i < chains.size() - 1) { + cout << ","; + } + } + if (chains.empty()) { + cout << "0"; } + cout << "\t"; + + // Net graph info + // Internal connectivity not important, we just want the size. + auto netGraph = manager.net_graph_of(snarl, graph, false); + cout << netGraph.get_node_count() << endl; } - if (chains.empty()) { - cout << "0"; + }); + + } + + if (chain_stats) { + // We will go through all the chains and compute stats. + + + // TSV header + cout << "Snarl1-Start\tSSnarl1-Start-Reversed\tSnarl1-End\tSnarl1-End-Reversed\tSnarl1-Reversed\tSnarl2-Start\tSSnarl2-Start-Reversed\tSnarl2-End\tSnarl2-End-Reversed\tSnarl2-Reversed\tSnarl-Count\tMax-Depth\tChild-Snarls\tChild-Chains\n"; + + manager.for_each_chain([&](const Chain* chain) { + // Loop over all the snarls and print stats. + + // snarl endpoints + cout << chain->front().first->start().node_id() << "\t" << chain->front().first->start().backward() << "\t" + << chain->front().first->end().node_id() << "\t" << chain->front().first->end().backward() << "\t" + << chain->front().second << "\t"; + cout << chain->back().first->start().node_id() << "\t" << chain->back().first->start().backward() << "\t" + << chain->back().first->end().node_id() << "\t" << chain->back().first->end().backward() << "\t" + << chain->back().second << "\t"; + + // snarl count + cout << chain->size() << "\t"; + + int64_t max_depth = 0; + int64_t child_snarls = 0; + int64_t child_chains = 0; + for (const auto& sr : *chain) { + const Snarl* snarl = sr.first; + max_depth = max(max_depth, (int64_t)depth.at(snarl)); + child_snarls += manager.children_of(snarl).size(); + child_chains += manager.chains_of(snarl).size(); } - cout << "\t"; - - // Net graph info - // Internal connectivity not important, we just want the size. - auto netGraph = manager.net_graph_of(snarl, graph, false); - cout << netGraph.get_node_count() << endl; + cout << max_depth << "\t" << child_snarls << "\t" << child_chains; + + cout << endl; }); } + if (!distance_index_filename.empty()) { //Print snarl stats from a distance index