Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/update-minimizers' into lr-giraffe
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Dec 18, 2024
2 parents bcc6d66 + c88b64a commit 6d2c333
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 48 deletions.
2 changes: 1 addition & 1 deletion deps/gbwtgraph
66 changes: 35 additions & 31 deletions src/subcommand/minimizer_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@

#include <gbwtgraph/index.h>

//#define WRITE_MINIMIZER_ZIPCODES
using namespace vg;

// Using too many threads just wastes CPU time without speeding up the construction.
Expand Down Expand Up @@ -359,9 +358,9 @@ int main_minimizer(int argc, char** argv) {
//oversized_zipcodes may be stored alongside the minimizer index in the file specified by zipcode_name
ZipCodeCollection oversized_zipcodes;

//oversized_zipcodes will be made as zipcodes are found in minimizers, so there may be duplicates that
//only get stored once. This maps node id to the index in oversized_zipcodes
hash_map<vg::id_t, size_t> node_id_to_zipcode_index;
//Map node id to what gets stored in the payload - either the zipcode or index into oversized_zipcodes
hash_map<vg::id_t, gbwtgraph::Payload> node_id_to_payload;
node_id_to_payload.reserve(gbz->graph.max_node_id() - gbz->graph.min_node_id());

// Build the index.
if (progress) {
Expand All @@ -379,49 +378,54 @@ int main_minimizer(int argc, char** argv) {
});
} else {
gbwtgraph::index_haplotypes(gbz->graph, *index, [&](const pos_t& pos) -> gbwtgraph::Payload {
gbwtgraph::Payload payload = MIPayload::NO_CODE;

#pragma omp critical
{
//If we've already seen this node before, then return the saved payload
if (node_id_to_payload.count(id(pos))) {
payload = node_id_to_payload[id(pos)];
}
}
if (payload != MIPayload::NO_CODE) {
return payload;
}


ZipCode zipcode;
zipcode.fill_in_zipcode(*distance_index, pos);
#ifdef WRITE_MINIMIZER_ZIPCODES
//TODO: this is only for testing, can be taken out once the zip codes are done
//This should only be used single threaded.
//For each minimizer, writes the size of the zip code and then the zip code as a tsv
pair<size_t, size_t> value (0, 0);

//How many bytes get used
cout << zipcode.zipcode.byte_count();
//Each integer saved
while (value.second != std::numeric_limits<size_t>::max()) {
value = zipcode.zipcode.get_value_and_next_index(value.second);
cout << "\t" << value.first;
}
cout << endl;
#endif
auto payload = zipcode.get_payload_from_zip();

payload = zipcode.get_payload_from_zip();
if (payload != MIPayload::NO_CODE) {
//If the zipcode is small enough to store in the payload
#pragma omp critical
{
node_id_to_payload.emplace(id(pos), payload);
}
return payload;
} else if (!zipcode_name.empty()) {
//Otherwise, if oversized zipcodes are being saved (zipcode_name is set), add this zipcode to the oversized zipcode list
//and remember the resulting payload for this node

//Fill in the decoder to be saved too
zipcode.fill_in_full_decoder();


size_t zip_index;
#pragma omp critical
{
if (node_id_to_zipcode_index.count(id(pos))) {
zip_index = node_id_to_zipcode_index.at(id(pos));
} else {
oversized_zipcodes.emplace_back(zipcode);
zip_index = oversized_zipcodes.size() - 1;
node_id_to_zipcode_index.emplace(id(pos), zip_index);
oversized_zipcodes.emplace_back(zipcode);
size_t zip_index = oversized_zipcodes.size() - 1;
payload= {0, zip_index};
node_id_to_payload.emplace(id(pos), payload);
}
}
return {0, zip_index};
return payload;
} else {
return MIPayload::NO_CODE;
//If the zipcode is too big and we don't have a file to save the big zipcodes
#pragma omp critical
{
payload = MIPayload::NO_CODE;
node_id_to_payload.emplace(id(pos), payload);
}
return payload;
}
});
}
Expand Down
32 changes: 16 additions & 16 deletions src/zip_code.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ size_t ZipCode::get_length(const size_t& depth, bool get_chain_component_length)
}

//Otherwise, we didn't store the length
throw std::runtime_error("zipcodes don't store lengths of top-level chains or snarls");
throw std::runtime_error("zipcodes don't store lengths of top-level chains or snarls. Do your zipcode, minimizer, and graph files match?");
}
} else if (decoder[depth].is_chain) {
//If this is a chain/node
Expand Down Expand Up @@ -501,13 +501,13 @@ size_t ZipCode::get_rank_in_snarl(const size_t& depth) const {

if (depth == 0) {
//If this is the root chain/snarl/node
throw std::runtime_error("zipcodes don't store ranks of top-level chains or snarls");
throw std::runtime_error("zipcodes don't store ranks of top-level chains or snarls. Do your zipcode, minimizer, and graph files match?");

} else if (decoder[depth].is_chain) {
//If this is a chain/node

if (decoder[depth-1].is_chain) {
throw std::runtime_error("zipcodes trying to find the rank in snarl of a node in a chain");
throw std::runtime_error("zipcodes trying to find the rank in snarl of a node in a chain. Do your zipcode, minimizer, and graph files match?");
}

size_t zip_value;
Expand All @@ -518,7 +518,7 @@ size_t ZipCode::get_rank_in_snarl(const size_t& depth) const {
return zip_value;
} else {
//If this is a snarl
throw std::runtime_error("zipcodes don't store snarl ranks for snarls");
throw std::runtime_error("zipcodes don't store snarl ranks for snarls. Do your zipcode, minimizer, and graph files match?");
}
}

Expand All @@ -545,7 +545,7 @@ size_t ZipCode::get_snarl_child_count(const size_t& depth, const SnarlDistanceIn
return zip_value;
} else {
//If this is not a snarl
throw std::runtime_error("trying to get the snarl child count of a non-snarl zipcode");
throw std::runtime_error("trying to get the snarl child count of a non-snarl zipcode. Do your zipcode, minimizer, and graph files match?");
}
}

Expand All @@ -554,13 +554,13 @@ size_t ZipCode::get_offset_in_chain(const size_t& depth, const SnarlDistanceInde

if (depth == 0) {
//If this is the root chain/snarl/node
throw std::runtime_error("zipcodes don't have chain offsets for roots");
throw std::runtime_error("zipcodes don't have chain offsets for roots. Do your zipcode, minimizer, and graph files match?");

} else if (decoder[depth].is_chain) {
//If this is a chain/node

if (!decoder[depth-1].is_chain) {
throw std::runtime_error("zipcodes trying to find the offset in child of a snarl");
throw std::runtime_error("zipcodes trying to find the offset in child of a snarl. Do your zipcode, minimizer, and graph files match?");
}
size_t zip_value;
size_t zip_index = decoder[depth].offset;
Expand All @@ -586,13 +586,13 @@ size_t ZipCode::get_chain_component(const size_t& depth) const {

if (depth == 0) {
//If this is the root chain/snarl/node
throw std::runtime_error("zipcodes don't have chain offsets for roots");
throw std::runtime_error("zipcodes don't have chain offsets for roots. Do your zipcode, minimizer, and graph files match?");

} else if (decoder[depth].is_chain) {
//If this is a chain/node

if (!decoder[depth-1].is_chain) {
throw std::runtime_error("zipcodes trying to find the offset in child of a snarl");
throw std::runtime_error("zipcodes trying to find the offset in child of a snarl. Do your zipcode, minimizer, and graph files match?");
}
size_t zip_value;
size_t zip_index = decoder[depth].offset;
Expand All @@ -617,7 +617,7 @@ size_t ZipCode::get_chain_component(const size_t& depth) const {
size_t ZipCode::get_last_chain_component(const size_t& depth, bool get_end) const {

if (!decoder[depth].is_chain) {
throw std::runtime_error("zipcodes trying to find the last chain component a snarl");
throw std::runtime_error("zipcodes trying to find the last chain component a snarl. Do your zipcode, minimizer, and graph files match?");
}
size_t zip_value;
size_t zip_index = decoder[depth].offset;
Expand All @@ -638,7 +638,7 @@ size_t ZipCode::get_last_chain_component(const size_t& depth, bool get_end) cons
bool ZipCode::get_is_looping_chain(const size_t& depth) const {

if (!decoder[depth].is_chain) {
throw std::runtime_error("zipcodes trying to find the last chain component a snarl");
throw std::runtime_error("zipcodes trying to find the last chain component a snarl. Do your zipcode, minimizer, and graph files match?");
}
size_t zip_value;
size_t zip_index = decoder[depth].offset;
Expand Down Expand Up @@ -709,7 +709,7 @@ net_handle_t ZipCode::get_net_handle(const size_t& depth, const SnarlDistanceInd
} else if (decoder[depth].is_chain) {
//If this is a chain/node

throw std::runtime_error("zipcodes trying to get a handle of a chain or node");
throw std::runtime_error("zipcodes trying to get a handle of a chain or node. Do your zipcode, minimizer, and graph files match?");
} else {
//If this is a snarl

Expand All @@ -722,7 +722,7 @@ net_handle_t ZipCode::get_net_handle(const size_t& depth, const SnarlDistanceInd
if (zip_value == 1) {
//If this is a regular snarl

throw std::runtime_error("zipcodes trying to get a handle of a regular snarl");
throw std::runtime_error("zipcodes trying to get a handle of a regular snarl. Do your zipcode, minimizer, and graph files match?");
} else {
//Irregular snarl

Expand Down Expand Up @@ -816,7 +816,7 @@ size_t ZipCode::get_distance_index_address(const size_t& depth) const {
} else if (decoder[depth].is_chain) {
//If this is a chain/node

throw std::runtime_error("zipcodes trying to get a handle of a chain or node");
throw std::runtime_error("zipcodes trying to get a handle of a chain or node. Do your zipcode, minimizer, and graph files match?");
} else {
//If this is a snarl

Expand All @@ -829,7 +829,7 @@ size_t ZipCode::get_distance_index_address(const size_t& depth) const {
if (zip_value == 1) {
//If this is a regular snarl

throw std::runtime_error("zipcodes trying to get a handle of a regular ansl");
throw std::runtime_error("zipcodes trying to get a handle of a regular snarl. Do your zipcode, minimizer, and graph files match?");
} else {
//Irregular snarl

Expand Down Expand Up @@ -2504,7 +2504,7 @@ net_identifier_t ZipCode::get_identifier(size_t depth) const {

const net_identifier_t ZipCode::get_parent_identifier(const net_identifier_t& child) {
if (child == "ROOT") {
throw std::runtime_error("error: trying to get the parent of the root net_identifier_t");
throw std::runtime_error("error: trying to get the parent of the root net_identifier_t. Do your zipcode, minimizer, and graph files match?");
}
for (int i = child.size()-1 ; i >= 0 ; i--) {
if (child[i] == '.') {
Expand Down

0 comments on commit 6d2c333

Please sign in to comment.