From 4e5ef9ef54c3e31802d98947968bee3bdd72c983 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 6 Oct 2023 13:10:26 -0700 Subject: [PATCH 01/43] point submods at same commit of libhandlegraph --- deps/libbdsg | 2 +- deps/libhandlegraph | 2 +- deps/xg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 97e42e0fb0f..f801d45995e 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 97e42e0fb0fe52c0953f52ba971317f83612726b +Subproject commit f801d45995ee8d7f50ec8066234543fda2b0314d diff --git a/deps/libhandlegraph b/deps/libhandlegraph index 0b519b72bec..b66842d9c99 160000 --- a/deps/libhandlegraph +++ b/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit 0b519b72becbeb8f56f0e3478a1aef54fa241106 +Subproject commit b66842d9c99efca74ee56b09b8bc3d21139ac641 diff --git a/deps/xg b/deps/xg index f2e565d9c5b..e05fa8374b5 160000 --- a/deps/xg +++ b/deps/xg @@ -1 +1 @@ -Subproject commit f2e565d9c5bd8a0bde3da69e365b8fc29d2bf7b7 +Subproject commit e05fa8374b53f1193100854da7fd06889f21203f From 99c3954468314bfa4e8ad4569674da40cdde4c49 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 6 Oct 2023 14:07:16 -0700 Subject: [PATCH 02/43] update libvgio --- deps/libvgio | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libvgio b/deps/libvgio index 1bc5ce5fc9b..f9c27978a0b 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit 1bc5ce5fc9b1c2938ad937b29962f1e4220fdb9b +Subproject commit f9c27978a0b04c644b7b90b05f63112be8a1dc6a From 2d4be84d76921144209b77c93128c6ac9abd8657 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 10 Oct 2023 17:02:04 -0700 Subject: [PATCH 03/43] save --- src/unittest/primer_filter.cpp | 350 +++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 src/unittest/primer_filter.cpp diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp new file mode 100644 index 00000000000..bbc2d29f988 
--- /dev/null +++ b/src/unittest/primer_filter.cpp @@ -0,0 +1,350 @@ +// +// primer_filter.cpp +// +// Unit tests for primer filter +// + +#include +#include +#include +#include +#include +#include +#include +#include "vg/io/json2pb.h" +#include +#include "catch.hpp" +#include "random_graph.hpp" +#include "randomness.hpp" +#include "../snarl_distance_index.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../genotypekit.hpp" +#include "../traversal_finder.hpp" +#include +#include +#include "xg.hpp" + +namespace vg { + namespace unittest { + + TEST_CASE( "filter simple primers", + "[primers]" ) { + + struct Primer { + string sequence; + bool left = true; + size_t position; + size_t length; + size_t left_offset; + size_t right_offset; + vector mapped_nodes_ids; + }; + + struct Primer_pair { + Primer left_primer; + Primer right_primer; + size_t product_size; + }; + + class Primer_finder { + private: + vector primer_pairs; + // HandleGraph* graph; + PathPositionHandleGraph* graph; + SnarlDistanceIndex* distance_index; + path_handle_t reference_path_handle; + vector selected_primer_pairs; + + public: + Primer_finder() = default; + Primer_finder( + //unique_ptr& graph_param, + unique_ptr& graph_param, + string reference_path_name, + SnarlDistanceIndex* distance_index_param) { + graph = graph_param.get(); + reference_path_handle = graph->get_path_handle("y"); + distance_index = distance_index_param; + } + ~Primer_finder() = default; + + + void load_primers(string path_to_primers) { + regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); + regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); + regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); + regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); + + Primer left_primer {""}; + Primer right_primer {"", false}; + + ifstream file_handle(path_to_primers); + if (file_handle.is_open()) { + string line; + while (getline(file_handle, line)) { + line = rstrip(line); + smatch match; + if 
(regex_search(line, match, left_seq_pattern)) { + if (right_primer.sequence != "") { + map_to_nodes(left_primer); + map_to_nodes(right_primer); + Primer_pair primer_pair{left_primer, right_primer, + right_primer.position - left_primer.position + 1}; + primer_pairs.push_back(primer_pair); + } + left_primer.sequence = match[1]; + } else if (regex_search(line, match, right_seq_pattern)) { + right_primer.sequence = match[1]; + } else if (regex_search(line, match, left_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + left_primer.position = stoi(pos_and_len[0]); + left_primer.length = stoi(pos_and_len[1]); + } else if (regex_search(line, match, right_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + right_primer.position = stoi(pos_and_len[0]); + right_primer.length = stoi(pos_and_len[1]); + } + } + Primer_pair primer_pair{left_primer, right_primer, + right_primer.position - left_primer.position + 1}; + primer_pairs.push_back(primer_pair); + } + } + + + void map_to_nodes(Primer& primer) { + string primer_seq; + if (primer.left) { + primer_seq = primer.sequence; + } else { + primer_seq = revcomp(primer.sequence); + } + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); + handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + string cur_node_sequence = graph->get_sequence(cur_node_handle); + size_t primer_matched_index = longest_match_len(cur_node_sequence, primer_seq) - 1; + while (primer_matched_index <= primer_seq.size()-1) { + cur_node_step_handle = graph->get_next_step(cur_node_step_handle); + cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + cur_node_sequence = graph->get_sequence(cur_node_handle); + string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); + 
primer_matched_index += longest_match_len(primer_substr, cur_node_sequence); + } + } + + + void run_test() { + cout << "testing HandleGraph..." << endl; + nid_t min_node_id = graph->min_node_id(); + nid_t max_node_id = graph->max_node_id(); + handle_t min_node = graph->get_handle(min_node_id); + handle_t max_node = graph->get_handle(max_node_id); + cout << "min node id: " << min_node_id << endl; + cout << "sequence: " << graph->get_sequence(min_node) << endl; + cout << "max node id: " << max_node_id << endl; + cout << "sequence: " << graph->get_sequence(max_node) << endl; + cout << "HandleGraph works! :)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing PathHandleGraph..." << endl; + cout << graph->get_path_count() << endl; + cout << "path with name y exists: " << graph->has_path("y") << endl; + cout << "reference path has " << graph->get_step_count(reference_path_handle) << + " node steps" << endl; + cout << "PathHandleGraph works! :)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing PathPositionHandleGraph..." 
<< endl; + cout << "referecne path length: " << graph->get_path_length(reference_path_handle) << endl; + step_handle_t step_handle_lprimer = graph->get_step_at_position(reference_path_handle, 362); // 362 is the position of the left primer of the first primer pair + handle_t handle_lprimer = graph->get_handle_of_step(step_handle_lprimer); + cout << "left primer sequence begin node seq: " << graph->get_sequence(handle_lprimer) << endl; + step_handle_t step_handle_lprimer_next = graph->get_next_step(step_handle_lprimer); + handle_t handle_lprimer_next = graph->get_handle_of_step(step_handle_lprimer_next); + cout << "left priemr sequence second node seq: " << graph->get_sequence(handle_lprimer_next) << endl; + + // Get node id for position 0 + step_handle_t position_0_step_handle = graph->get_step_at_position(reference_path_handle, 0); + handle_t position_0_handle = graph->get_handle_of_step(position_0_step_handle); + cout << "node id for position 0 " << graph->get_id(position_0_handle) << endl; + // Get node id for position 1 + step_handle_t position_1_step_handle = graph->get_step_at_position(reference_path_handle, 1); + handle_t position_1_handle = graph->get_handle_of_step(position_1_step_handle); + cout << "node id for position 1 " << graph->get_id(position_1_handle) << endl; + + // Get node id for position 31 + step_handle_t position_31_step_handle = graph->get_step_at_position(reference_path_handle, 31); + handle_t position_31_handle = graph->get_handle_of_step(position_31_step_handle); + cout << "node id for position 31 " << graph->get_id(position_31_handle) << endl; + + // Get node id for position 32 + step_handle_t position_32_step_handle = graph->get_step_at_position(reference_path_handle, 32); + handle_t position_32_handle = graph->get_handle_of_step(position_32_step_handle); + cout << "node id for position 32 " << graph->get_id(position_32_handle) << endl; + + cout << "PathPositionHandleGraph works! 
:)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing SnarlDistanceIndex..." << endl; + net_handle_t root_node = distance_index->get_root(); + cout << "is root a root? " << distance_index->is_root(root_node) << endl; + cout << "is root a node? " << distance_index->is_node(root_node) << endl; + cout << "is root a snarl? " << distance_index->is_snarl(root_node) << endl; + cout << "is root a chain? " << distance_index->is_chain(root_node) << endl; + cout << "depth of root is: " << distance_index->get_depth(root_node) << endl; + + net_handle_t min_node_net_handle = distance_index->get_net(min_node, graph); + cout << "depth of min node is: " << distance_index->get_depth(min_node_net_handle) << endl; + cout << "make sure that min node net handle is a node: " << distance_index->is_node(min_node_net_handle) << endl; + + cout << "SnarlDistanceIndex works! :)" << endl; + cout << "-------------------------------------" << endl; + + cout << "testing load_primers..." 
<< endl; + for (vector::iterator it = primer_pairs.begin(); it != primer_pairs.end(); ++it) { + Primer left_primer = it->left_primer; + Primer right_primer = it->right_primer; + cout << "product size: " << it->product_size << endl; + cout << left_primer.left << " " << left_primer.position << " " << + left_primer.length << " " << left_primer.sequence << endl; + for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { + size_t cur_node_id = left_primer.mapped_nodes_ids[i]; + handle_t cur_node_handle = graph->get_handle(cur_node_id); + cout << graph->get_sequence(cur_node_handle) << " "; + } + cout << endl; + cout << right_primer.left << " " << right_primer.position << " " << + right_primer.length << " " << right_primer.sequence << " " << revcomp(right_primer.sequence) << endl; + for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { + size_t cur_node_id = left_primer.mapped_nodes_ids[i]; + handle_t cur_node_handle = graph->get_handle(cur_node_id); + cout << graph->get_sequence(cur_node_handle) << " "; + } + cout << endl; + cout << endl; + } + cout << "load_primers works! :)" << endl; + cout << "-------------------------------------" << endl; + + } + // void filter_primer() { + + //} + + private: + // Functions only used in load_primers().. 
Not sure where to put them for now + string rstrip(string const s) { + const string WHITESPACE = " \n\r\t\f\v"; + size_t end = s.find_last_not_of(WHITESPACE); + if (end == string::npos) { + return ""; + } + return s.substr(0, end+1); + } + + size_t longest_match_len(string const left_seq, string const right_seq) { + size_t llen = left_seq.size(), rlen = right_seq.size(); + size_t length = min(llen, rlen); + size_t longest_match = 0; + for (int i = 1; i <= length; i++) { + if (right_seq.substr(0, length) == left_seq.substr(rlen - i, i)) { + longest_match = i; + } + } + return longest_match; + } + + char complement(char nt) { + switch(nt) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + } + return 'N'; + } + + string revcomp(string const seq) { + string revcomp_seq; + for (int i = seq.size()-1; i >= 0; i--) { + revcomp_seq += complement(seq[i]); + } + return revcomp_seq; + } + + vector split(string str, string const delim) { + size_t cur_pos = 0; + string word; + vector word_list; + while ((cur_pos = str.find(delim)) != string::npos) { + word = str.substr(0, cur_pos); + word_list.push_back(word); + str.erase(0, cur_pos + delim.length()); + } + word = str; + word_list.push_back(word); + return word_list; + } + }; + + SnarlDistanceIndex distance_index; + //unique_ptr graph; + unique_ptr graph; + string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; + string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; + distance_index.deserialize(snarl_index_path); + //graph = vg::io::VPKG::load_one(xg_graph_path); + graph = vg::io::VPKG::load_one(xg_graph_path); + + //Primer_finder primer_finder; + Primer_finder primer_finder(graph, "y", &distance_index); + primer_finder.load_primers(primers_path); + primer_finder.run_test(); + + + + + // SnarlDistanceIndex distance_index; + + // string 
snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.dist"; + // string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.xg"; + + // distance_index.deserialize(snarl_index_path); + // unique_ptr graph = vg::io::VPKG::load_one(xg_graph_path); + + // net_handle_t node = distance_index.get_node_net_handle(1); + // cout << distance_index.net_handle_as_string(node) << endl; + + // net_handle_t root_node = distance_index.get_root(); + // cout << "root: " << distance_index.net_handle_as_string(root_node) << endl; + + // cout << distance_index.is_root(node) << endl; + // cout << distance_index.is_snarl(node) << endl; + // cout << distance_index.is_chain(node) << endl; + // cout << distance_index.is_node(node) << endl; + // cout << "--------------------------------------------" << endl; + // cout << graph->get_node_count() << endl; + // cout << graph->get_edge_count() << endl; + // nid_t min_node_id = graph->min_node_id(); + // cout << graph->has_node(min_node_id) << endl; + // nid_t max_node_id = graph->max_node_id(); + // cout << min_node_id << endl; + // cout << max_node_id << endl; + // cout << "--------------------------------------------" << endl; + // handle_t min_node = graph->get_handle(min_node_id); + // cout << graph->get_sequence(min_node) << endl; + // handle_t max_node = graph->get_handle(max_node_id); + // cout << graph->get_sequence(max_node) << endl; + + + } + + } + +} \ No newline at end of file From 5ff9c6517b95d1dab136bb29cb3ea33ef9f21941 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Wed, 11 Oct 2023 00:06:25 -0700 Subject: [PATCH 04/43] update --- src/unittest/primer_filter.cpp | 50 +++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index bbc2d29f988..89ed4a9cd70 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -35,15 +35,16 @@ namespace vg { bool left = true; size_t 
position; size_t length; - size_t left_offset; - size_t right_offset; + size_t offset; vector mapped_nodes_ids; }; struct Primer_pair { Primer left_primer; Primer right_primer; - size_t product_size; + size_t linear_product_size; + size_t min_product_size; + size_t max_product_size; }; class Primer_finder { @@ -91,6 +92,8 @@ namespace vg { Primer_pair primer_pair{left_primer, right_primer, right_primer.position - left_primer.position + 1}; primer_pairs.push_back(primer_pair); + left_primer = {""}; + right_primer = {"", false}; } left_primer.sequence = match[1]; } else if (regex_search(line, match, right_seq_pattern)) { @@ -101,10 +104,13 @@ namespace vg { left_primer.length = stoi(pos_and_len[1]); } else if (regex_search(line, match, right_pos_pattern)) { vector pos_and_len = split(match[1], ","); - right_primer.position = stoi(pos_and_len[0]); right_primer.length = stoi(pos_and_len[1]); + right_primer.position = stoi(pos_and_len[0]) - right_primer.length; + } } + map_to_nodes(left_primer); + map_to_nodes(right_primer); Primer_pair primer_pair{left_primer, right_primer, right_primer.position - left_primer.position + 1}; primer_pairs.push_back(primer_pair); @@ -123,14 +129,14 @@ namespace vg { handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); string cur_node_sequence = graph->get_sequence(cur_node_handle); - size_t primer_matched_index = longest_match_len(cur_node_sequence, primer_seq) - 1; - while (primer_matched_index <= primer_seq.size()-1) { + size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq) - 1; + while (primer_matched_index < primer_seq.size()-1) { cur_node_step_handle = graph->get_next_step(cur_node_step_handle); cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); cur_node_sequence = graph->get_sequence(cur_node_handle); string primer_substr = 
primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); - primer_matched_index += longest_match_len(primer_substr, cur_node_sequence); + primer_matched_index += longest_match_len(primer, primer_substr, cur_node_sequence); } } @@ -202,6 +208,12 @@ namespace vg { cout << "depth of min node is: " << distance_index->get_depth(min_node_net_handle) << endl; cout << "make sure that min node net handle is a node: " << distance_index->is_node(min_node_net_handle) << endl; + size_t min_dist_12_17 = distance_index->minimum_distance(12, false, 1, 17, false, 2); + size_t max_dist_12_17 = distance_index->maximum_distance(12, false, 3, 17, false, 3); + + cout << "min dist between node 12 and node 17: " << min_dist_12_17 << endl; + cout << "max dist between node 12 and node 17: " << max_dist_12_17 << endl; + cout << "SnarlDistanceIndex works! :)" << endl; cout << "-------------------------------------" << endl; @@ -209,7 +221,7 @@ namespace vg { for (vector::iterator it = primer_pairs.begin(); it != primer_pairs.end(); ++it) { Primer left_primer = it->left_primer; Primer right_primer = it->right_primer; - cout << "product size: " << it->product_size << endl; + cout << "product size: " << it->linear_product_size << endl; cout << left_primer.left << " " << left_primer.position << " " << left_primer.length << " " << left_primer.sequence << endl; for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { @@ -220,8 +232,8 @@ namespace vg { cout << endl; cout << right_primer.left << " " << right_primer.position << " " << right_primer.length << " " << right_primer.sequence << " " << revcomp(right_primer.sequence) << endl; - for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { - size_t cur_node_id = left_primer.mapped_nodes_ids[i]; + for (int i = 0; i < right_primer.mapped_nodes_ids.size(); i++) { + size_t cur_node_id = right_primer.mapped_nodes_ids[i]; handle_t cur_node_handle = graph->get_handle(cur_node_id); cout << 
graph->get_sequence(cur_node_handle) << " "; } @@ -247,15 +259,27 @@ namespace vg { return s.substr(0, end+1); } - size_t longest_match_len(string const left_seq, string const right_seq) { + size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq) { size_t llen = left_seq.size(), rlen = right_seq.size(); size_t length = min(llen, rlen); size_t longest_match = 0; - for (int i = 1; i <= length; i++) { - if (right_seq.substr(0, length) == left_seq.substr(rlen - i, i)) { + + // Change .. can be done in one for loop + if (llen >= rlen) { + for (size_t i = 0; i <= llen - rlen; i++) { + if (left_seq.substr(i, rlen) == right_seq) { + longest_match = rlen; + return longest_match; + } + } + } + + for (size_t i = 1; i <= length; i++) { + if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { longest_match = i; } } + return longest_match; } From 04ab25a312d18eade4df0b9e970729366b1b96f7 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Fri, 13 Oct 2023 12:15:32 -0700 Subject: [PATCH 05/43] update --- src/primer_filter.cpp | 254 +++++++++++++++++ src/primer_filter.hpp | 163 +++++++++++ src/unittest/primer_filter.cpp | 377 +++---------------------- src/unittest/primers.cpp | 499 +++++++++++++++++++++++++++++++++ 4 files changed, 949 insertions(+), 344 deletions(-) create mode 100644 src/primer_filter.cpp create mode 100644 src/primer_filter.hpp create mode 100644 src/unittest/primers.cpp diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp new file mode 100644 index 00000000000..d772cacb6e2 --- /dev/null +++ b/src/primer_filter.cpp @@ -0,0 +1,254 @@ +#include "primer_filter.hpp" + +namespace vg { + +using namespace std; + +Primer_finder::Primer_finder( unique_ptr& graph_param, + string reference_path_name, SnarlDistanceIndex* distance_index_param) { + graph = graph_param.get(); + reference_path_handle = graph->get_path_handle("y"); + distance_index = distance_index_param; +} + +Primer_finder::~Primer_finder() { + // nothing to do +} + 
+vector Primer_finder::get_primer_pairs() { + return primer_pairs; +} + +vector Primer_finder::get_selected_primer_pairs() { + return selected_primer_pairs; +} + +void Primer_finder::add_primer_pair(size_t left_primer_starting_node_id, + size_t left_primer_offset, size_t left_primer_length, + size_t right_primer_starting_node_id, + size_t right_primer_offset, size_t right_primer_length) { + Primer left_primer; + Primer right_primer; + make_primer(left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); + make_primer(right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); + Primer_pair new_primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + primer_pairs.push_back(new_primer_pair); + if (no_variation(new_primer_pair)) { + selected_primer_pairs.push_back(new_primer_pair); + } +} + +void Primer_finder::load_primers(string path_to_primers) { + regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); + regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); + regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); + regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); + + Primer left_primer {""}; + Primer right_primer {"", false}; + + ifstream file_handle(path_to_primers); + assert(file_handle.is_open()); + + string line; + while (getline(file_handle, line)) { + line = rstrip(line); + smatch match; + if (regex_search(line, match, left_seq_pattern)) { + if (right_primer.sequence != "") { + map_to_nodes(left_primer); + map_to_nodes(right_primer); + Primer_pair primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + primer_pairs.push_back(primer_pair); + if (no_variation(primer_pair)) { + selected_primer_pairs.push_back(primer_pair); + } + left_primer = {""}; + right_primer = {"", false}; + } + left_primer.sequence = match[1]; + } else if (regex_search(line, match, 
right_seq_pattern)) { + right_primer.sequence = match[1]; + } else if (regex_search(line, match, left_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + left_primer.position = stoi(pos_and_len[0]); + left_primer.length = stoi(pos_and_len[1]); + } else if (regex_search(line, match, right_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + right_primer.length = stoi(pos_and_len[1]); + right_primer.position = stoi(pos_and_len[0]) - right_primer.length + 1; + + } + } + + // Process and store the last pair of primers + map_to_nodes(left_primer); + map_to_nodes(right_primer); + Primer_pair primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + if (no_variation(primer_pair)) { + selected_primer_pairs.push_back(primer_pair); + } + primer_pairs.push_back(primer_pair); +} + +void Primer_finder::make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left) { + if (is_left) { + primer.left = true; + } else { + primer.left = false; + } + primer.length = length; + string sequence = ""; + handle_t cur_handle = graph->get_handle(starting_node_id); + step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; + primer.position = graph->get_position_of_step(cur_step_handle) + offset; + if (graph->get_length(cur_handle) - offset > length) { + sequence += graph->get_sequence(cur_handle).substr(offset, length); + } else { + sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); + while (sequence.size() < length) { + cur_step_handle = graph->get_next_step(cur_step_handle); + cur_handle = graph->get_handle_of_step(cur_step_handle); + sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); + } + } + + if (is_left) { + primer.sequence = sequence; + } else { + primer.sequence = revcomp(sequence); + } + map_to_nodes(primer); +} + +void 
Primer_finder::map_to_nodes(Primer& primer) { + string primer_seq; + if (primer.left) { + primer_seq = primer.sequence; + } else { + primer_seq = revcomp(primer.sequence); + } + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); + handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + string cur_node_sequence = graph->get_sequence(cur_node_handle); + size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq, true) - 1; + while (primer_matched_index < primer_seq.size()-1) { + cur_node_step_handle = graph->get_next_step(cur_node_step_handle); + cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + cur_node_sequence = graph->get_sequence(cur_node_handle); + string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); + primer_matched_index += longest_match_len(primer, primer_substr, cur_node_sequence, false); + } +} + +size_t Primer_finder::longest_match_len(Primer& primer, string const left_seq, string const right_seq, + bool const first_node) { + size_t llen = left_seq.size(), rlen = right_seq.size(); + size_t length = min(llen, rlen); + size_t longest_match = 0; + + // Change .. 
can be done in one for loop + if (first_node && llen >= rlen) { + for (size_t i = 0; i <= llen - rlen; i++) { + if (left_seq.substr(i, rlen) == right_seq) { + longest_match = rlen; + if (primer.left) { + primer.offset = i; + } else { + primer.offset = i + primer.sequence.size(); + } + return longest_match; + } + } + } + + if (first_node) { + for (size_t i = 1; i <= length; i++) { + if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { + longest_match = i; + if (primer.left && first_node) { + primer.offset = llen - i; + } else if (!primer.left) { + primer.offset = i; + } + } + } + } else { + for (size_t i = 1; i <= length; i++) { + if (left_seq.substr(0, i) == right_seq.substr(0, i)) { + longest_match = i; + if (!primer.left) { + primer.offset = i; + } + } + } + } + + return longest_match; +} + +string Primer_finder::rstrip(string const s) { + const string WHITESPACE = " \n\r\t\f\v"; + size_t end = s.find_last_not_of(WHITESPACE); + if (end == string::npos) { + return ""; + } + return s.substr(0, end+1); +} + + +bool Primer_finder::no_variation(const Primer_pair& primer_pair) { + Primer left_primer = primer_pair.left_primer; + Primer right_primer = primer_pair.right_primer; + for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { + handle_t cur_handle = graph->get_handle(*node_id); + net_handle_t cur_net_handle = distance_index->get_net(cur_handle, graph); + size_t depth = distance_index->get_depth(cur_net_handle); + if (depth != 1) { + return false; + } + } + return true; +} + + +char Primer_finder::complement(char nt) { + switch(nt) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + } + return 'N'; +} + + +string Primer_finder::revcomp(string const seq) { + string revcomp_seq; + for (int i = seq.size()-1; i >= 0; i--) { + revcomp_seq += complement(seq[i]); + } + return revcomp_seq; +} + + +vector Primer_finder::split(string str, string 
const delim) { + size_t cur_pos = 0; + string word; + vector word_list; + while ((cur_pos = str.find(delim)) != string::npos) { + word = str.substr(0, cur_pos); + word_list.push_back(word); + str.erase(0, cur_pos + delim.length()); + } + word = str; + word_list.push_back(word); + return word_list; +} + +} \ No newline at end of file diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp new file mode 100644 index 00000000000..73aaf487881 --- /dev/null +++ b/src/primer_filter.hpp @@ -0,0 +1,163 @@ +// primer_filter.hpp +// +// Contains class Primer_finder for storing and filtering primers predicted +// using Primer3. Also contains Primer struct and Primer_pair struct that stores +// information on primers and primer pairs. + +#ifndef VG_PRIMER_FILTER_HPP_INCLUDED +#define VG_PRIMER_FILTER_HPP_INCLUDED + +// Not sure what to include.. Will include everything from the unittest for now +#include +#include +#include +#include +#include +#include +#include +#include +#include "snarl_distance_index.hpp" +#include "integrated_snarl_finder.hpp" +#include "genotypekit.hpp" +#include "traversal_finder.hpp" +#include +#include + +using namespace std; + +namespace vg { + +struct Primer { + string sequence; + bool left = true; + size_t position; + size_t length; + size_t offset; + vector mapped_nodes_ids; +}; + +struct Primer_pair { + Primer left_primer; + Primer right_primer; + size_t linear_product_size; + size_t min_product_size; + size_t max_product_size; +}; + +class Primer_finder { + +private: + vector primer_pairs; + vector selected_primer_pairs; + PathPositionHandleGraph* graph; + SnarlDistanceIndex* distance_index; + path_handle_t reference_path_handle; + +public: + Primer_finder() = default; + + /** + * Construct Primer finder given PathPositionHandleGraph, reference graph name + * and pointer to SnarlDistanceIndex + */ + Primer_finder(unique_ptr& graph_param, + string reference_path_name, SnarlDistanceIndex* distance_index_param); + + /** + * Destructor + */ + 
~Primer_finder(); + + /** + * Add a Primer_pair object given primers' starting node id, offset relative + * to the starting node, and length, all in the POSTIVE orientation. The new + * primer_pair object is automatically added to primer_pairs vector - and + * selected_primer_pairs if conditions are met. Mainly used for unit testing. + */ + void add_primer_pair(size_t left_primer_starting_node_id, + size_t left_primer_offset, size_t left_primer_length, + size_t right_primer_starting_node_id, + size_t right_primer_offset, size_t right_primer_length); + + /** + * Read the path to the primer3 output. Primers information is parsed, + * processed, and stored in primer_pairs vector - and selected_primer_pairs + * if conditions are met. + */ + void load_primers(string path_to_primers); + + /** + * return vector of Primer pairs + */ + vector get_primer_pairs(); + + /** + * return vector selected primer pairs + */ + vector get_selected_primer_pairs(); + +private: + /** + * Private functions used by public or private functions. + */ + + /** + * Update a Primer object given starting node id, offset relative to the starting node, + * and the length of primer. + * Used in: add_primer_pair + */ + void make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left); + + /** + * Find and store corresponding node ids to Primer object. + * Used in: make_primer + * load_primers + */ + void map_to_nodes(Primer& primer); + + /** + * Find the length of the longest match between two sequences. Also find and + * store offset in Primer object. + * Used in: map_to_nodes + */ + size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq, + bool const first_node); + + /** + * Strip empty spaces on the right side of a string. + * Used in: load_primers + */ + string rstrip(string const s); + + /** + * Check if primers in a primer_pair object have variations on the pangenome. 
+ * Used in: add_primer_node + * load_primers + */ + bool no_variation(const Primer_pair& primer_pair); + + /** + * return the complement of a nucleotide. + * Used in: revcomp + */ + char complement(char nt); + + /** + * return the reverse complement of a sequence. + * Used in: make_primers + * map_to_nodes + * + */ + string revcomp(string const seq); + + /** + * Split a string into vectors. + * Used in: load_priemrs + */ + vector split(string str, string const delim); + +}; + +} + +#endif /* primder_filter_hpp */ \ No newline at end of file diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 89ed4a9cd70..05cf7a1d578 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -1,5 +1,5 @@ // -// primer_filter.cpp +// primers.cpp // // Unit tests for primer filter // @@ -23,352 +23,41 @@ #include #include #include "xg.hpp" +#include "../primer_filter.hpp" namespace vg { - namespace unittest { - - TEST_CASE( "filter simple primers", - "[primers]" ) { - - struct Primer { - string sequence; - bool left = true; - size_t position; - size_t length; - size_t offset; - vector mapped_nodes_ids; - }; - - struct Primer_pair { - Primer left_primer; - Primer right_primer; - size_t linear_product_size; - size_t min_product_size; - size_t max_product_size; - }; - - class Primer_finder { - private: - vector primer_pairs; - // HandleGraph* graph; - PathPositionHandleGraph* graph; - SnarlDistanceIndex* distance_index; - path_handle_t reference_path_handle; - vector selected_primer_pairs; - - public: - Primer_finder() = default; - Primer_finder( - //unique_ptr& graph_param, - unique_ptr& graph_param, - string reference_path_name, - SnarlDistanceIndex* distance_index_param) { - graph = graph_param.get(); - reference_path_handle = graph->get_path_handle("y"); - distance_index = distance_index_param; - } - ~Primer_finder() = default; - - - void load_primers(string path_to_primers) { - regex 
left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); - regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); - regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); - regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); - - Primer left_primer {""}; - Primer right_primer {"", false}; - - ifstream file_handle(path_to_primers); - if (file_handle.is_open()) { - string line; - while (getline(file_handle, line)) { - line = rstrip(line); - smatch match; - if (regex_search(line, match, left_seq_pattern)) { - if (right_primer.sequence != "") { - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair{left_primer, right_primer, - right_primer.position - left_primer.position + 1}; - primer_pairs.push_back(primer_pair); - left_primer = {""}; - right_primer = {"", false}; - } - left_primer.sequence = match[1]; - } else if (regex_search(line, match, right_seq_pattern)) { - right_primer.sequence = match[1]; - } else if (regex_search(line, match, left_pos_pattern)) { - vector pos_and_len = split(match[1], ","); - left_primer.position = stoi(pos_and_len[0]); - left_primer.length = stoi(pos_and_len[1]); - } else if (regex_search(line, match, right_pos_pattern)) { - vector pos_and_len = split(match[1], ","); - right_primer.length = stoi(pos_and_len[1]); - right_primer.position = stoi(pos_and_len[0]) - right_primer.length; - - } - } - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair{left_primer, right_primer, - right_primer.position - left_primer.position + 1}; - primer_pairs.push_back(primer_pair); - } - } - - - void map_to_nodes(Primer& primer) { - string primer_seq; - if (primer.left) { - primer_seq = primer.sequence; - } else { - primer_seq = revcomp(primer.sequence); - } - step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); - handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - 
primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - string cur_node_sequence = graph->get_sequence(cur_node_handle); - size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq) - 1; - while (primer_matched_index < primer_seq.size()-1) { - cur_node_step_handle = graph->get_next_step(cur_node_step_handle); - cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - cur_node_sequence = graph->get_sequence(cur_node_handle); - string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); - primer_matched_index += longest_match_len(primer, primer_substr, cur_node_sequence); - } - } - - - void run_test() { - cout << "testing HandleGraph..." << endl; - nid_t min_node_id = graph->min_node_id(); - nid_t max_node_id = graph->max_node_id(); - handle_t min_node = graph->get_handle(min_node_id); - handle_t max_node = graph->get_handle(max_node_id); - cout << "min node id: " << min_node_id << endl; - cout << "sequence: " << graph->get_sequence(min_node) << endl; - cout << "max node id: " << max_node_id << endl; - cout << "sequence: " << graph->get_sequence(max_node) << endl; - cout << "HandleGraph works! :)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing PathHandleGraph..." << endl; - cout << graph->get_path_count() << endl; - cout << "path with name y exists: " << graph->has_path("y") << endl; - cout << "reference path has " << graph->get_step_count(reference_path_handle) << - " node steps" << endl; - cout << "PathHandleGraph works! :)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing PathPositionHandleGraph..." 
<< endl; - cout << "referecne path length: " << graph->get_path_length(reference_path_handle) << endl; - step_handle_t step_handle_lprimer = graph->get_step_at_position(reference_path_handle, 362); // 362 is the position of the left primer of the first primer pair - handle_t handle_lprimer = graph->get_handle_of_step(step_handle_lprimer); - cout << "left primer sequence begin node seq: " << graph->get_sequence(handle_lprimer) << endl; - step_handle_t step_handle_lprimer_next = graph->get_next_step(step_handle_lprimer); - handle_t handle_lprimer_next = graph->get_handle_of_step(step_handle_lprimer_next); - cout << "left priemr sequence second node seq: " << graph->get_sequence(handle_lprimer_next) << endl; - - // Get node id for position 0 - step_handle_t position_0_step_handle = graph->get_step_at_position(reference_path_handle, 0); - handle_t position_0_handle = graph->get_handle_of_step(position_0_step_handle); - cout << "node id for position 0 " << graph->get_id(position_0_handle) << endl; - // Get node id for position 1 - step_handle_t position_1_step_handle = graph->get_step_at_position(reference_path_handle, 1); - handle_t position_1_handle = graph->get_handle_of_step(position_1_step_handle); - cout << "node id for position 1 " << graph->get_id(position_1_handle) << endl; - - // Get node id for position 31 - step_handle_t position_31_step_handle = graph->get_step_at_position(reference_path_handle, 31); - handle_t position_31_handle = graph->get_handle_of_step(position_31_step_handle); - cout << "node id for position 31 " << graph->get_id(position_31_handle) << endl; - - // Get node id for position 32 - step_handle_t position_32_step_handle = graph->get_step_at_position(reference_path_handle, 32); - handle_t position_32_handle = graph->get_handle_of_step(position_32_step_handle); - cout << "node id for position 32 " << graph->get_id(position_32_handle) << endl; - - cout << "PathPositionHandleGraph works! 
:)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing SnarlDistanceIndex..." << endl; - net_handle_t root_node = distance_index->get_root(); - cout << "is root a root? " << distance_index->is_root(root_node) << endl; - cout << "is root a node? " << distance_index->is_node(root_node) << endl; - cout << "is root a snarl? " << distance_index->is_snarl(root_node) << endl; - cout << "is root a chain? " << distance_index->is_chain(root_node) << endl; - cout << "depth of root is: " << distance_index->get_depth(root_node) << endl; - - net_handle_t min_node_net_handle = distance_index->get_net(min_node, graph); - cout << "depth of min node is: " << distance_index->get_depth(min_node_net_handle) << endl; - cout << "make sure that min node net handle is a node: " << distance_index->is_node(min_node_net_handle) << endl; - - size_t min_dist_12_17 = distance_index->minimum_distance(12, false, 1, 17, false, 2); - size_t max_dist_12_17 = distance_index->maximum_distance(12, false, 3, 17, false, 3); - - cout << "min dist between node 12 and node 17: " << min_dist_12_17 << endl; - cout << "max dist between node 12 and node 17: " << max_dist_12_17 << endl; - - cout << "SnarlDistanceIndex works! :)" << endl; - cout << "-------------------------------------" << endl; - - cout << "testing load_primers..." 
<< endl; - for (vector::iterator it = primer_pairs.begin(); it != primer_pairs.end(); ++it) { - Primer left_primer = it->left_primer; - Primer right_primer = it->right_primer; - cout << "product size: " << it->linear_product_size << endl; - cout << left_primer.left << " " << left_primer.position << " " << - left_primer.length << " " << left_primer.sequence << endl; - for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { - size_t cur_node_id = left_primer.mapped_nodes_ids[i]; - handle_t cur_node_handle = graph->get_handle(cur_node_id); - cout << graph->get_sequence(cur_node_handle) << " "; - } - cout << endl; - cout << right_primer.left << " " << right_primer.position << " " << - right_primer.length << " " << right_primer.sequence << " " << revcomp(right_primer.sequence) << endl; - for (int i = 0; i < right_primer.mapped_nodes_ids.size(); i++) { - size_t cur_node_id = right_primer.mapped_nodes_ids[i]; - handle_t cur_node_handle = graph->get_handle(cur_node_id); - cout << graph->get_sequence(cur_node_handle) << " "; - } - cout << endl; - cout << endl; - } - cout << "load_primers works! :)" << endl; - cout << "-------------------------------------" << endl; - - } - // void filter_primer() { - - //} - - private: - // Functions only used in load_primers().. Not sure where to put them for now - string rstrip(string const s) { - const string WHITESPACE = " \n\r\t\f\v"; - size_t end = s.find_last_not_of(WHITESPACE); - if (end == string::npos) { - return ""; - } - return s.substr(0, end+1); - } - - size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq) { - size_t llen = left_seq.size(), rlen = right_seq.size(); - size_t length = min(llen, rlen); - size_t longest_match = 0; - - // Change .. 
can be done in one for loop - if (llen >= rlen) { - for (size_t i = 0; i <= llen - rlen; i++) { - if (left_seq.substr(i, rlen) == right_seq) { - longest_match = rlen; - return longest_match; - } - } - } - - for (size_t i = 1; i <= length; i++) { - if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { - longest_match = i; - } - } - - return longest_match; - } - - char complement(char nt) { - switch(nt) { - case 'A': return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - } - return 'N'; - } - - string revcomp(string const seq) { - string revcomp_seq; - for (int i = seq.size()-1; i >= 0; i--) { - revcomp_seq += complement(seq[i]); - } - return revcomp_seq; - } - - vector split(string str, string const delim) { - size_t cur_pos = 0; - string word; - vector word_list; - while ((cur_pos = str.find(delim)) != string::npos) { - word = str.substr(0, cur_pos); - word_list.push_back(word); - str.erase(0, cur_pos + delim.length()); - } - word = str; - word_list.push_back(word); - return word_list; - } - }; - - SnarlDistanceIndex distance_index; - //unique_ptr graph; - unique_ptr graph; - string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; - string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; - distance_index.deserialize(snarl_index_path); - //graph = vg::io::VPKG::load_one(xg_graph_path); - graph = vg::io::VPKG::load_one(xg_graph_path); - - //Primer_finder primer_finder; - Primer_finder primer_finder(graph, "y", &distance_index); - primer_finder.load_primers(primers_path); - primer_finder.run_test(); - - - - - // SnarlDistanceIndex distance_index; - - // string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.dist"; - // string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.xg"; - - // distance_index.deserialize(snarl_index_path); - // unique_ptr 
graph = vg::io::VPKG::load_one(xg_graph_path); - - // net_handle_t node = distance_index.get_node_net_handle(1); - // cout << distance_index.net_handle_as_string(node) << endl; - - // net_handle_t root_node = distance_index.get_root(); - // cout << "root: " << distance_index.net_handle_as_string(root_node) << endl; - - // cout << distance_index.is_root(node) << endl; - // cout << distance_index.is_snarl(node) << endl; - // cout << distance_index.is_chain(node) << endl; - // cout << distance_index.is_node(node) << endl; - // cout << "--------------------------------------------" << endl; - // cout << graph->get_node_count() << endl; - // cout << graph->get_edge_count() << endl; - // nid_t min_node_id = graph->min_node_id(); - // cout << graph->has_node(min_node_id) << endl; - // nid_t max_node_id = graph->max_node_id(); - // cout << min_node_id << endl; - // cout << max_node_id << endl; - // cout << "--------------------------------------------" << endl; - // handle_t min_node = graph->get_handle(min_node_id); - // cout << graph->get_sequence(min_node) << endl; - // handle_t max_node = graph->get_handle(max_node_id); - // cout << graph->get_sequence(max_node) << endl; - +namespace unittest { + +using namespace std; +using namespace Primer_finder; + + TEST_CASE( "filter simple primers", + "[primers]" ) { + + SnarlDistanceIndex distance_index; + unique_ptr graph; + string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; + string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; + distance_index.deserialize(snarl_index_path); + graph = vg::io::VPKG::load_one(xg_graph_path); + Primer_finder primer_finder(graph, "y", &distance_index); + primer_finder.load_primers(primers_path); + + SECTION("Loads the correct number of primer pairs") { + REQUIRE(primer_finder.get_primer_pairs().size() == 5); + } + SECTION("Loads in the sequences 
correctly") { + vector primer_pairs = primer_finder.get_primer_pairs(); + REQUIRED(primer_pairs[0].left_primer.sequence == 'TGCCTGGCATAGAGGAAAGC'); + REQUIRED(primer_pairs[0].left_primer.position == 362); + REQUIRED(primer_pairs[0].left_primer.length == 20); + REQUIRED(primer_pairs[0].right_primer.sequence == 'GCCAGAAGAGCCTCAAGGAG'); + REQUIRED(primer_pairs[0].right_primer.position == 466); + REQUIRED(primer_pairs[0].right_primer.length == 20); } } - -} \ No newline at end of file +} +} \ No newline at end of file diff --git a/src/unittest/primers.cpp b/src/unittest/primers.cpp new file mode 100644 index 00000000000..d640267a10b --- /dev/null +++ b/src/unittest/primers.cpp @@ -0,0 +1,499 @@ +// +// primers.cpp +// +// Unit tests for primer filter +// + +#include +#include +#include +#include +#include +#include +#include +#include "vg/io/json2pb.h" +#include +#include "catch.hpp" +#include "random_graph.hpp" +#include "randomness.hpp" +#include "../snarl_distance_index.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../genotypekit.hpp" +#include "../traversal_finder.hpp" +#include +#include +#include "xg.hpp" + +namespace vg { + namespace unittest { + + TEST_CASE( "filter simple primers", + "[primers]" ) { + + struct Primer { + string sequence; + bool left = true; + size_t position; + size_t length; + size_t offset; + vector mapped_nodes_ids; + }; + + struct Primer_pair { + Primer left_primer; + Primer right_primer; + size_t linear_product_size; + size_t min_product_size; + size_t max_product_size; + }; + + class Primer_finder { + private: + vector primer_pairs; + PathPositionHandleGraph* graph; + SnarlDistanceIndex* distance_index; + path_handle_t reference_path_handle; + vector selected_primer_pairs; + + public: + Primer_finder() = default; + Primer_finder( + //unique_ptr& graph_param, + unique_ptr& graph_param, + string reference_path_name, + SnarlDistanceIndex* distance_index_param) { + graph = graph_param.get(); + reference_path_handle = 
graph->get_path_handle("y"); + distance_index = distance_index_param; + } + ~Primer_finder() = default; + + + void make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left) { + if (is_left) { + primer.left = true; + } else { + primer.left = false; + } + primer.length = length; + string sequence = ""; + handle_t cur_handle = graph->get_handle(starting_node_id); + step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; + primer.position = graph->get_position_of_step(cur_step_handle) + offset; + if (graph->get_length(cur_handle) - offset > length) { + sequence += graph->get_sequence(cur_handle).substr(offset, length); + } else { + sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); + while (sequence.size() < length) { + cur_step_handle = graph->get_next_step(cur_step_handle); + cur_handle = graph->get_handle_of_step(cur_step_handle); + sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); + } + } + + if (is_left) { + primer.sequence = sequence; + } else { + primer.sequence = revcomp(sequence); + } + map_to_nodes(primer); + } + + // starting position relative to the starting node + Primer_pair add_primer_pair(size_t left_primer_starting_node_id, + size_t left_primer_offset, size_t left_primer_length, + size_t right_primer_starting_node_id, + size_t right_primer_offset, size_t right_primer_length) + { + Primer left_primer; + Primer right_primer; + make_primer(left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); + make_primer(right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); + Primer_pair new_primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + primer_pairs.push_back(new_primer_pair); + if (no_variation(new_primer_pair)) { + 
selected_primer_pairs.push_back(new_primer_pair); + } + + return new_primer_pair; + } + + + void load_primers(string path_to_primers) { + regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); + regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); + regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); + regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); + + Primer left_primer {""}; + Primer right_primer {"", false}; + + ifstream file_handle(path_to_primers); + if (file_handle.is_open()) { + string line; + while (getline(file_handle, line)) { + line = rstrip(line); + smatch match; + if (regex_search(line, match, left_seq_pattern)) { + if (right_primer.sequence != "") { + map_to_nodes(left_primer); + map_to_nodes(right_primer); + Primer_pair primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + primer_pairs.push_back(primer_pair); + if (no_variation(primer_pair)) { + selected_primer_pairs.push_back(primer_pair); + } + left_primer = {""}; + right_primer = {"", false}; + } + left_primer.sequence = match[1]; + } else if (regex_search(line, match, right_seq_pattern)) { + right_primer.sequence = match[1]; + } else if (regex_search(line, match, left_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + left_primer.position = stoi(pos_and_len[0]); + left_primer.length = stoi(pos_and_len[1]); + } else if (regex_search(line, match, right_pos_pattern)) { + vector pos_and_len = split(match[1], ","); + right_primer.length = stoi(pos_and_len[1]); + right_primer.position = stoi(pos_and_len[0]) - right_primer.length + 1; + + } + } + map_to_nodes(left_primer); + map_to_nodes(right_primer); + Primer_pair primer_pair {left_primer, right_primer, + right_primer.position - left_primer.position + right_primer.length}; + if (no_variation(primer_pair)) { + selected_primer_pairs.push_back(primer_pair); + } + primer_pairs.push_back(primer_pair); + } + } + + + void map_to_nodes(Primer& primer) { + 
string primer_seq; + if (primer.left) { + primer_seq = primer.sequence; + } else { + primer_seq = revcomp(primer.sequence); + } + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); + handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + string cur_node_sequence = graph->get_sequence(cur_node_handle); + size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq, true) - 1; + while (primer_matched_index < primer_seq.size()-1) { + cur_node_step_handle = graph->get_next_step(cur_node_step_handle); + cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); + cur_node_sequence = graph->get_sequence(cur_node_handle); + string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); + primer_matched_index += longest_match_len(primer, primer_substr, cur_node_sequence, false); + } + } + + void run_test() { + cout << "testing HandleGraph..." << endl; + nid_t min_node_id = graph->min_node_id(); + nid_t max_node_id = graph->max_node_id(); + handle_t min_node = graph->get_handle(min_node_id); + handle_t max_node = graph->get_handle(max_node_id); + cout << "min node id: " << min_node_id << endl; + cout << "sequence: " << graph->get_sequence(min_node) << endl; + cout << "max node id: " << max_node_id << endl; + cout << "sequence: " << graph->get_sequence(max_node) << endl; + cout << "HandleGraph works! :)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing PathHandleGraph..." << endl; + cout << graph->get_path_count() << endl; + cout << "path with name y exists: " << graph->has_path("y") << endl; + cout << "reference path has " << graph->get_step_count(reference_path_handle) << + " node steps" << endl; + cout << "PathHandleGraph works! 
:)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing PathPositionHandleGraph..." << endl; + cout << "referecne path length: " << graph->get_path_length(reference_path_handle) << endl; + step_handle_t step_handle_lprimer = graph->get_step_at_position(reference_path_handle, 362); // 362 is the position of the left primer of the first primer pair + handle_t handle_lprimer = graph->get_handle_of_step(step_handle_lprimer); + cout << "left primer sequence begin node seq: " << graph->get_sequence(handle_lprimer) << endl; + step_handle_t step_handle_lprimer_next = graph->get_next_step(step_handle_lprimer); + handle_t handle_lprimer_next = graph->get_handle_of_step(step_handle_lprimer_next); + cout << "left priemr sequence second node seq: " << graph->get_sequence(handle_lprimer_next) << endl; + + // Get node id for position 0 + step_handle_t position_0_step_handle = graph->get_step_at_position(reference_path_handle, 0); + handle_t position_0_handle = graph->get_handle_of_step(position_0_step_handle); + cout << "node id for position 0 " << graph->get_id(position_0_handle) << endl; + // Get node id for position 1 + step_handle_t position_1_step_handle = graph->get_step_at_position(reference_path_handle, 1); + handle_t position_1_handle = graph->get_handle_of_step(position_1_step_handle); + cout << "node id for position 1 " << graph->get_id(position_1_handle) << endl; + + // Get node id for position 31 + step_handle_t position_31_step_handle = graph->get_step_at_position(reference_path_handle, 31); + handle_t position_31_handle = graph->get_handle_of_step(position_31_step_handle); + cout << "node id for position 31 " << graph->get_id(position_31_handle) << endl; + + // Get node id for position 32 + step_handle_t position_32_step_handle = graph->get_step_at_position(reference_path_handle, 32); + handle_t position_32_handle = graph->get_handle_of_step(position_32_step_handle); + cout << "node id for position 32 " << 
graph->get_id(position_32_handle) << endl; + + cout << "PathPositionHandleGraph works! :)" << endl; + + cout << "-------------------------------------" << endl; + + cout << "testing SnarlDistanceIndex..." << endl; + net_handle_t root_node = distance_index->get_root(); + cout << "is root a root? " << distance_index->is_root(root_node) << endl; + cout << "is root a node? " << distance_index->is_node(root_node) << endl; + cout << "is root a snarl? " << distance_index->is_snarl(root_node) << endl; + cout << "is root a chain? " << distance_index->is_chain(root_node) << endl; + cout << "depth of root is: " << distance_index->get_depth(root_node) << endl; + + net_handle_t min_node_net_handle = distance_index->get_net(min_node, graph); + cout << "depth of min node is: " << distance_index->get_depth(min_node_net_handle) << endl; + cout << "make sure that min node net handle is a node: " << distance_index->is_node(min_node_net_handle) << endl; + + size_t min_dist_12_17 = distance_index->minimum_distance(12, false, 1, 17, false, 2); + size_t max_dist_12_17 = distance_index->maximum_distance(12, false, 3, 17, false, 3); + + cout << "min dist between node 12 and node 17: " << min_dist_12_17 << endl; + cout << "max dist between node 12 and node 17: " << max_dist_12_17 << endl; + + cout << "SnarlDistanceIndex works! :)" << endl; + cout << "-------------------------------------" << endl; + + cout << "Testing PathPositionHandleGraph..." << endl; + string left_primer_sequence; + handle_t starting_handle = graph->get_handle(3); + size_t left_primer_offset = 1; + size_t left_primer_length = 10; + step_handle_t starting_step_handle = graph->steps_of_handle(starting_handle)[0]; + left_primer_sequence += graph->get_sequence(starting_handle).substr(left_primer_offset, + graph->get_length(starting_handle) - left_primer_offset); + cout << left_primer_sequence << endl; + + + + + cout << "-------------------------------------" << endl; + + cout << "testing load_primers..." 
<< endl; + add_primer_pair(3, 1, 10, 8, 3, 15); + add_primer_pair(9, 14, 20, 22, 0, 20); + for (vector::iterator it = primer_pairs.begin(); it != primer_pairs.end(); ++it) { + Primer left_primer = it->left_primer; + Primer right_primer = it->right_primer; + cout << "offsets: " << left_primer.offset << " " << right_primer.offset << endl; + cout << "product size: " << it->linear_product_size << endl; + + size_t min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], + false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], + false, right_primer.offset); + cout << "minimum product size: " << min_product_size << endl; + + size_t max_product_size = distance_index->maximum_distance(left_primer.mapped_nodes_ids[0], + false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], + false, right_primer.offset); + cout << "maximum product size: " << max_product_size << endl; + + cout << left_primer.left << " " << left_primer.position << " " << + left_primer.length << " " << left_primer.sequence << endl; + for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { + size_t cur_node_id = left_primer.mapped_nodes_ids[i]; + handle_t cur_node_handle = graph->get_handle(cur_node_id); + cout << graph->get_sequence(cur_node_handle) << " "; + } + cout << endl; + cout << right_primer.left << " " << right_primer.position << " " << + right_primer.length << " " << right_primer.sequence << " " << revcomp(right_primer.sequence) << endl; + for (int i = 0; i < right_primer.mapped_nodes_ids.size(); i++) { + size_t cur_node_id = right_primer.mapped_nodes_ids[i]; + handle_t cur_node_handle = graph->get_handle(cur_node_id); + cout << graph->get_sequence(cur_node_handle) << " "; + } + cout << endl; + cout << endl; + } + + for (vector::iterator it = selected_primer_pairs.begin(); it != selected_primer_pairs.end(); ++it) { + Primer left_primer = it->left_primer; + Primer right_primer = 
it->right_primer; + cout << left_primer.sequence << " - " << right_primer.sequence << endl; + } + + cout << endl; + cout << "load_primers works! :)" << endl; + cout << "-------------------------------------" << endl; + + } + // void filter_primer() { + + //} + + private: + // Functions only used in load_primers().. Not sure where to put them for now + string rstrip(string const s) { + const string WHITESPACE = " \n\r\t\f\v"; + size_t end = s.find_last_not_of(WHITESPACE); + if (end == string::npos) { + return ""; + } + return s.substr(0, end+1); + } + + bool no_variation(const Primer_pair& primer_pair) { + Primer left_primer = primer_pair.left_primer; + Primer right_primer = primer_pair.right_primer; + for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { + handle_t cur_handle = graph->get_handle(*node_id); + net_handle_t cur_net_handle = distance_index->get_net(cur_handle, graph); + size_t depth = distance_index->get_depth(cur_net_handle); + if (depth != 1) { + return false; + } + } + return true; + } + size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq, bool const first_node) { + size_t llen = left_seq.size(), rlen = right_seq.size(); + size_t length = min(llen, rlen); + size_t longest_match = 0; + + // Change .. 
can be done in one for loop + if (first_node && llen >= rlen) { + for (size_t i = 0; i <= llen - rlen; i++) { + if (left_seq.substr(i, rlen) == right_seq) { + longest_match = rlen; + if (primer.left) { + primer.offset = i; + } else { + primer.offset = i + primer.sequence.size(); + } + return longest_match; + } + } + } + + if (first_node) { + for (size_t i = 1; i <= length; i++) { + if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { + longest_match = i; + if (primer.left && first_node) { + primer.offset = llen - i; + } else if (!primer.left) { + primer.offset = i; + } + } + } + } else { + for (size_t i = 1; i <= length; i++) { + if (left_seq.substr(0, i) == right_seq.substr(0, i)) { + longest_match = i; + if (!primer.left) { + primer.offset = i; + } + } + } + } + + return longest_match; + } + + char complement(char nt) { + switch(nt) { + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + case 'T': return 'A'; + } + return 'N'; + } + + string revcomp(string const seq) { + string revcomp_seq; + for (int i = seq.size()-1; i >= 0; i--) { + revcomp_seq += complement(seq[i]); + } + return revcomp_seq; + } + + vector split(string str, string const delim) { + size_t cur_pos = 0; + string word; + vector word_list; + while ((cur_pos = str.find(delim)) != string::npos) { + word = str.substr(0, cur_pos); + word_list.push_back(word); + str.erase(0, cur_pos + delim.length()); + } + word = str; + word_list.push_back(word); + return word_list; + } + }; + + SnarlDistanceIndex distance_index; + unique_ptr graph; + string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; + string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; + distance_index.deserialize(snarl_index_path); + //graph = vg::io::VPKG::load_one(xg_graph_path); + graph = vg::io::VPKG::load_one(xg_graph_path); + + //Primer_finder primer_finder; + 
Primer_finder primer_finder(graph, "y", &distance_index); + primer_finder.load_primers(primers_path); + primer_finder.run_test(); + + + + + // SnarlDistanceIndex distance_index; + + // string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.dist"; + // string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.xg"; + + // distance_index.deserialize(snarl_index_path); + // unique_ptr graph = vg::io::VPKG::load_one(xg_graph_path); + + // net_handle_t node = distance_index.get_node_net_handle(1); + // cout << distance_index.net_handle_as_string(node) << endl; + + // net_handle_t root_node = distance_index.get_root(); + // cout << "root: " << distance_index.net_handle_as_string(root_node) << endl; + + // cout << distance_index.is_root(node) << endl; + // cout << distance_index.is_snarl(node) << endl; + // cout << distance_index.is_chain(node) << endl; + // cout << distance_index.is_node(node) << endl; + // cout << "--------------------------------------------" << endl; + // cout << graph->get_node_count() << endl; + // cout << graph->get_edge_count() << endl; + // nid_t min_node_id = graph->min_node_id(); + // cout << graph->has_node(min_node_id) << endl; + // nid_t max_node_id = graph->max_node_id(); + // cout << min_node_id << endl; + // cout << max_node_id << endl; + // cout << "--------------------------------------------" << endl; + // handle_t min_node = graph->get_handle(min_node_id); + // cout << graph->get_sequence(min_node) << endl; + // handle_t max_node = graph->get_handle(max_node_id); + // cout << graph->get_sequence(max_node) << endl; + + + } + + } + +} \ No newline at end of file From 02dac080cc9f23ffbe7b6c76bc238db7aab4c0b4 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Sun, 15 Oct 2023 18:55:27 -0700 Subject: [PATCH 06/43] update --- src/primer_filter.cpp | 46 ++++++++++++----------- src/primer_filter.hpp | 7 ++++ src/unittest/primer_filter.cpp | 67 +++++++++++++++++++++++++++++----- 3 files changed, 90 
insertions(+), 30 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index d772cacb6e2..56a3bb7b110 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -33,6 +33,7 @@ void Primer_finder::add_primer_pair(size_t left_primer_starting_node_id, make_primer(right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); Primer_pair new_primer_pair {left_primer, right_primer, right_primer.position - left_primer.position + right_primer.length}; + update_min_max_product_size(new_primer_pair); primer_pairs.push_back(new_primer_pair); if (no_variation(new_primer_pair)) { selected_primer_pairs.push_back(new_primer_pair); @@ -61,6 +62,7 @@ void Primer_finder::load_primers(string path_to_primers) { map_to_nodes(right_primer); Primer_pair primer_pair {left_primer, right_primer, right_primer.position - left_primer.position + right_primer.length}; + update_min_max_product_size(primer_pair); primer_pairs.push_back(primer_pair); if (no_variation(primer_pair)) { selected_primer_pairs.push_back(primer_pair); @@ -88,6 +90,7 @@ void Primer_finder::load_primers(string path_to_primers) { map_to_nodes(right_primer); Primer_pair primer_pair {left_primer, right_primer, right_primer.position - left_primer.position + right_primer.length}; + update_min_max_product_size(primer_pair); if (no_variation(primer_pair)) { selected_primer_pairs.push_back(primer_pair); } @@ -124,6 +127,19 @@ void Primer_finder::make_primer(Primer& primer, size_t starting_node_id, size_t map_to_nodes(primer); } +void Primer_finder::update_min_max_product_size(Primer_pair& primer_pair) { + Primer left_primer = primer_pair.left_primer; + Primer right_primer = primer_pair.right_primer; + + primer_pair.min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], + false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], + false, right_primer.offset); + + primer_pair.max_product_size = 
distance_index->maximum_distance(left_primer.mapped_nodes_ids[0], + false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], + false, right_primer.offset); +} + void Primer_finder::map_to_nodes(Primer& primer) { string primer_seq; if (primer.left) { @@ -152,39 +168,27 @@ size_t Primer_finder::longest_match_len(Primer& primer, string const left_seq, s size_t length = min(llen, rlen); size_t longest_match = 0; - // Change .. can be done in one for loop - if (first_node && llen >= rlen) { - for (size_t i = 0; i <= llen - rlen; i++) { - if (left_seq.substr(i, rlen) == right_seq) { - longest_match = rlen; - if (primer.left) { - primer.offset = i; - } else { - primer.offset = i + primer.sequence.size(); + if (first_node) { + if (llen >= rlen) { + for (size_t i = 0; i <= llen - rlen; i++) { + if (left_seq.substr(i, rlen) == right_seq) { + longest_match = rlen; + primer.offset = (primer.left) ? i : i + primer.sequence.size(); + return longest_match; } - return longest_match; } } - } - - if (first_node) { for (size_t i = 1; i <= length; i++) { if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { longest_match = i; - if (primer.left && first_node) { - primer.offset = llen - i; - } else if (!primer.left) { - primer.offset = i; - } + primer.offset = (primer.left && first_node) ? llen - i : i; } } } else { for (size_t i = 1; i <= length; i++) { if (left_seq.substr(0, i) == right_seq.substr(0, i)) { longest_match = i; - if (!primer.left) { - primer.offset = i; - } + primer.offset = (!primer.left) ? i : primer.offset; } } } diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index 73aaf487881..c88f2d259ba 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -101,6 +101,13 @@ class Primer_finder { * Private functions used by public or private functions. */ + /** + * Update minimum and maximum prodcut to a primer pair object. 
+ * Used in: add_primer_pair + * load_primers + */ + void update_min_max_product_size(Primer_pair& primer_pair); + /** * Update a Primer object given starting node id, offset relative to the starting node, * and the length of primer. diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 05cf7a1d578..eb28b53db91 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -29,10 +29,9 @@ namespace vg { namespace unittest { using namespace std; -using namespace Primer_finder; TEST_CASE( "filter simple primers", - "[primers]" ) { + "[primer_filter]" ) { SnarlDistanceIndex distance_index; unique_ptr graph; @@ -48,14 +47,64 @@ using namespace Primer_finder; REQUIRE(primer_finder.get_primer_pairs().size() == 5); } - SECTION("Loads in the sequences correctly") { + SECTION("Loads and processes the primers correctly") { + primer_finder.add_primer_pair(9, 14, 20, 22, 0, 20); vector primer_pairs = primer_finder.get_primer_pairs(); - REQUIRED(primer_pairs[0].left_primer.sequence == 'TGCCTGGCATAGAGGAAAGC'); - REQUIRED(primer_pairs[0].left_primer.position == 362); - REQUIRED(primer_pairs[0].left_primer.length == 20); - REQUIRED(primer_pairs[0].right_primer.sequence == 'GCCAGAAGAGCCTCAAGGAG'); - REQUIRED(primer_pairs[0].right_primer.position == 466); - REQUIRED(primer_pairs[0].right_primer.length == 20); + vector selected_primer_pairs = primer_finder.get_selected_primer_pairs(); + Primer_pair pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. + Primer_pair pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. 
+ + SECTION("Check for basic primer attributes") { + REQUIRE(pair_0.left_primer.sequence == "TGCCTGGCATAGAGGAAAGC"); + REQUIRE(pair_0.left_primer.position == 362); + REQUIRE(pair_0.left_primer.length == 20); + REQUIRE(pair_0.right_primer.sequence == "GCCAGAAGAGCCTCAAGGAG"); + REQUIRE(pair_0.right_primer.position == 466); + REQUIRE(pair_0.right_primer.length == 20); + REQUIRE(pair_5.left_primer.sequence == "AGCCAGACAAATCTGGGTTC"); + REQUIRE(pair_5.left_primer.position == 181); + REQUIRE(pair_5.left_primer.length == 20); + REQUIRE(pair_5.right_primer.sequence == "AGATAATTAAACTGAAGTTC"); + REQUIRE(pair_5.right_primer.position == 260); + REQUIRE(pair_5.right_primer.length == 20); + } + + SECTION("Check for minimum and maximum distance") { + REQUIRE(pair_0.linear_product_size == 124); + REQUIRE(pair_0.min_product_size == 124); + REQUIRE(pair_0.max_product_size == 124); + REQUIRE(pair_5.linear_product_size == 99); + REQUIRE(pair_5.min_product_size == 97); + REQUIRE(pair_5.max_product_size == 100); + } + + SECTION("Check that primers are mapped to correct nodes") { + vector pair_0_left_primer_nodes {27, 8}; + for (size_t i = 0; i < pair_0.left_primer.mapped_nodes_ids.size()-1; i++) { + REQUIRE(pair_0.left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + } + + vector pair_0_right_primer_nodes {33, 34}; + for (size_t i = 0; i < pair_0.right_primer.mapped_nodes_ids.size()-1; i++) { + REQUIRE(pair_0.right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + } + + vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; + for (size_t i = 0; i < pair_5.left_primer.mapped_nodes_ids.size()-1; i++) { + REQUIRE(pair_5.left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + } + + vector pair_5_right_primer_nodes {22, 24, 25}; + for (size_t i = 0; i < pair_5.right_primer.mapped_nodes_ids.size()-1; i++) { + REQUIRE(pair_5.right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + } + } + + SECTION("Check for variation at primer sites") { + 
REQUIRE(primer_pairs.size() == 6); + REQUIRE(selected_primer_pairs.size() == 5); + } + } } From 57016df8126a8886a6f2012f6fc99e2da0c4794d Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 16 Oct 2023 11:10:57 -0700 Subject: [PATCH 07/43] update something --- src/primer_filter.cpp | 7 +++++-- src/primer_filter.hpp | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 56a3bb7b110..d4fe97d8933 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -34,10 +34,11 @@ void Primer_finder::add_primer_pair(size_t left_primer_starting_node_id, Primer_pair new_primer_pair {left_primer, right_primer, right_primer.position - left_primer.position + right_primer.length}; update_min_max_product_size(new_primer_pair); - primer_pairs.push_back(new_primer_pair); if (no_variation(new_primer_pair)) { + new_primer_pair.no_variation = true; selected_primer_pairs.push_back(new_primer_pair); } + primer_pairs.push_back(new_primer_pair); } void Primer_finder::load_primers(string path_to_primers) { @@ -63,10 +64,11 @@ void Primer_finder::load_primers(string path_to_primers) { Primer_pair primer_pair {left_primer, right_primer, right_primer.position - left_primer.position + right_primer.length}; update_min_max_product_size(primer_pair); - primer_pairs.push_back(primer_pair); if (no_variation(primer_pair)) { + primer_pair.no_variation = true; selected_primer_pairs.push_back(primer_pair); } + primer_pairs.push_back(primer_pair); left_primer = {""}; right_primer = {"", false}; } @@ -92,6 +94,7 @@ void Primer_finder::load_primers(string path_to_primers) { right_primer.position - left_primer.position + right_primer.length}; update_min_max_product_size(primer_pair); if (no_variation(primer_pair)) { + primer_pair.no_variation = true; selected_primer_pairs.push_back(primer_pair); } primer_pairs.push_back(primer_pair); diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index c88f2d259ba..54827590e23 100644 --- 
a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -42,6 +42,7 @@ struct Primer_pair { size_t linear_product_size; size_t min_product_size; size_t max_product_size; + bool no_variation = false; }; class Primer_finder { From fe3df0649bcf6f7e50d3394347e10db4ed82811b Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 16 Oct 2023 16:31:20 -0700 Subject: [PATCH 08/43] more optimized --- src/primer_filter.cpp | 64 ++++++++++++---------------------- src/primer_filter.hpp | 61 ++++++++++++++++---------------- src/unittest/primer_filter.cpp | 8 ++--- 3 files changed, 56 insertions(+), 77 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index d4fe97d8933..81c94cb1e8d 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -4,8 +4,8 @@ namespace vg { using namespace std; -Primer_finder::Primer_finder( unique_ptr& graph_param, - string reference_path_name, SnarlDistanceIndex* distance_index_param) { +Primer_finder::Primer_finder(const unique_ptr& graph_param, + const string& reference_path_name, const SnarlDistanceIndex* distance_index_param) { graph = graph_param.get(); reference_path_handle = graph->get_path_handle("y"); distance_index = distance_index_param; @@ -15,18 +15,18 @@ Primer_finder::~Primer_finder() { // nothing to do } -vector Primer_finder::get_primer_pairs() { +const vector& Primer_finder::get_primer_pairs() const { return primer_pairs; } -vector Primer_finder::get_selected_primer_pairs() { +const vector& Primer_finder::get_selected_primer_pairs() const { return selected_primer_pairs; } -void Primer_finder::add_primer_pair(size_t left_primer_starting_node_id, - size_t left_primer_offset, size_t left_primer_length, - size_t right_primer_starting_node_id, - size_t right_primer_offset, size_t right_primer_length) { +void Primer_finder::add_primer_pair(const size_t& left_primer_starting_node_id, + const size_t& left_primer_offset, const size_t& left_primer_length, + const size_t& right_primer_starting_node_id, + const size_t& 
right_primer_offset, const size_t& right_primer_length) { Primer left_primer; Primer right_primer; make_primer(left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); @@ -41,7 +41,7 @@ void Primer_finder::add_primer_pair(size_t left_primer_starting_node_id, primer_pairs.push_back(new_primer_pair); } -void Primer_finder::load_primers(string path_to_primers) { +void Primer_finder::load_primers(const string& path_to_primers) { regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); @@ -76,11 +76,11 @@ void Primer_finder::load_primers(string path_to_primers) { } else if (regex_search(line, match, right_seq_pattern)) { right_primer.sequence = match[1]; } else if (regex_search(line, match, left_pos_pattern)) { - vector pos_and_len = split(match[1], ","); + const vector pos_and_len = split(match[1], ","); left_primer.position = stoi(pos_and_len[0]); left_primer.length = stoi(pos_and_len[1]); } else if (regex_search(line, match, right_pos_pattern)) { - vector pos_and_len = split(match[1], ","); + const vector pos_and_len = split(match[1], ","); right_primer.length = stoi(pos_and_len[1]); right_primer.position = stoi(pos_and_len[0]) - right_primer.length + 1; @@ -100,7 +100,8 @@ void Primer_finder::load_primers(string path_to_primers) { primer_pairs.push_back(primer_pair); } -void Primer_finder::make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left) { +void Primer_finder::make_primer(Primer& primer, const size_t& starting_node_id, + const size_t& offset, const size_t& length, const bool& is_left) { if (is_left) { primer.left = true; } else { @@ -125,14 +126,14 @@ void Primer_finder::make_primer(Primer& primer, size_t starting_node_id, size_t if (is_left) { primer.sequence = sequence; } else { - primer.sequence = revcomp(sequence); + primer.sequence = reverse_complement(sequence); 
} map_to_nodes(primer); } void Primer_finder::update_min_max_product_size(Primer_pair& primer_pair) { - Primer left_primer = primer_pair.left_primer; - Primer right_primer = primer_pair.right_primer; + const Primer& left_primer = primer_pair.left_primer; + const Primer& right_primer = primer_pair.right_primer; primer_pair.min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], @@ -148,7 +149,7 @@ void Primer_finder::map_to_nodes(Primer& primer) { if (primer.left) { primer_seq = primer.sequence; } else { - primer_seq = revcomp(primer.sequence); + primer_seq = reverse_complement(primer.sequence); } step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); @@ -165,8 +166,8 @@ void Primer_finder::map_to_nodes(Primer& primer) { } } -size_t Primer_finder::longest_match_len(Primer& primer, string const left_seq, string const right_seq, - bool const first_node) { +size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, + const string& right_seq, const bool& first_node) { size_t llen = left_seq.size(), rlen = right_seq.size(); size_t length = min(llen, rlen); size_t longest_match = 0; @@ -199,7 +200,7 @@ size_t Primer_finder::longest_match_len(Primer& primer, string const left_seq, s return longest_match; } -string Primer_finder::rstrip(string const s) { +const string Primer_finder::rstrip(const string& s) const { const string WHITESPACE = " \n\r\t\f\v"; size_t end = s.find_last_not_of(WHITESPACE); if (end == string::npos) { @@ -209,7 +210,7 @@ string Primer_finder::rstrip(string const s) { } -bool Primer_finder::no_variation(const Primer_pair& primer_pair) { +const bool Primer_finder::no_variation(const Primer_pair& primer_pair) const { Primer left_primer = primer_pair.left_primer; Primer right_primer = 
primer_pair.right_primer; for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { @@ -223,28 +224,7 @@ bool Primer_finder::no_variation(const Primer_pair& primer_pair) { return true; } - -char Primer_finder::complement(char nt) { - switch(nt) { - case 'A': return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - } - return 'N'; -} - - -string Primer_finder::revcomp(string const seq) { - string revcomp_seq; - for (int i = seq.size()-1; i >= 0; i--) { - revcomp_seq += complement(seq[i]); - } - return revcomp_seq; -} - - -vector Primer_finder::split(string str, string const delim) { +const vector Primer_finder::split(string str, const string& delim) const { size_t cur_pos = 0; string word; vector word_list; diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index 54827590e23..a85b3355643 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -16,6 +16,7 @@ #include #include #include +#include "utility.hpp" #include "snarl_distance_index.hpp" #include "integrated_snarl_finder.hpp" #include "genotypekit.hpp" @@ -27,6 +28,12 @@ using namespace std; namespace vg { +/** + * Primer struct contains primer attributes, including sequence, left/right primer, + * position on the reference genome, length, index offset in corresponding node on + * sequence graph, and vector of corresponding nodes on the sequence graph. Everything + * is in the positive/forward orientation. + */ struct Primer { string sequence; bool left = true; @@ -36,6 +43,11 @@ struct Primer { vector mapped_nodes_ids; }; +/** + * Primer_pair struct contains primer pair attributesm including left primer, right primer, + * linear product size, minimum and maximum product size on the sequence graph, and boolean on + * whether the primers locate in low variation region of the sequence graph. 
+ */ struct Primer_pair { Primer left_primer; Primer right_primer; @@ -50,8 +62,8 @@ class Primer_finder { private: vector primer_pairs; vector selected_primer_pairs; - PathPositionHandleGraph* graph; - SnarlDistanceIndex* distance_index; + const PathPositionHandleGraph* graph; + const SnarlDistanceIndex* distance_index; path_handle_t reference_path_handle; public: @@ -61,8 +73,8 @@ class Primer_finder { * Construct Primer finder given PathPositionHandleGraph, reference graph name * and pointer to SnarlDistanceIndex */ - Primer_finder(unique_ptr& graph_param, - string reference_path_name, SnarlDistanceIndex* distance_index_param); + Primer_finder(const unique_ptr& graph_param, + const string& reference_path_name, const SnarlDistanceIndex* distance_index_param); /** * Destructor @@ -75,27 +87,27 @@ class Primer_finder { * primer_pair object is automatically added to primer_pairs vector - and * selected_primer_pairs if conditions are met. Mainly used for unit testing. */ - void add_primer_pair(size_t left_primer_starting_node_id, - size_t left_primer_offset, size_t left_primer_length, - size_t right_primer_starting_node_id, - size_t right_primer_offset, size_t right_primer_length); + void add_primer_pair(const size_t& left_primer_starting_node_id, + const size_t& left_primer_offset, const size_t& left_primer_length, + const size_t& right_primer_starting_node_id, + const size_t& right_primer_offset, const size_t& right_primer_length); /** * Read the path to the primer3 output. Primers information is parsed, * processed, and stored in primer_pairs vector - and selected_primer_pairs * if conditions are met. 
*/ - void load_primers(string path_to_primers); + void load_primers(const string& path_to_primers); /** * return vector of Primer pairs */ - vector get_primer_pairs(); + const vector& get_primer_pairs() const; /** * return vector selected primer pairs */ - vector get_selected_primer_pairs(); + const vector& get_selected_primer_pairs() const; private: /** @@ -114,7 +126,8 @@ class Primer_finder { * and the length of primer. * Used in: add_primer_pair */ - void make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left); + void make_primer(Primer& primer, const size_t& starting_node_id, + const size_t& offset, const size_t& length, const bool& is_left); /** * Find and store corresponding node ids to Primer object. @@ -128,41 +141,27 @@ class Primer_finder { * store offset in Primer object. * Used in: map_to_nodes */ - size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq, - bool const first_node); + size_t longest_match_len(Primer& primer, const string& left_seq, const string& right_seq, + const bool& first_node); /** * Strip empty spaces on the right side of a string. * Used in: load_primers */ - string rstrip(string const s); + const string rstrip(const string& s) const; /** * Check if primers in a primer_pair object have variations on the pangenome. * Used in: add_primer_node * load_primers */ - bool no_variation(const Primer_pair& primer_pair); - - /** - * return the complement of a nucleotide. - * Used in: revcomp - */ - char complement(char nt); - - /** - * return the reverse complement of a sequence. - * Used in: make_primers - * map_to_nodes - * - */ - string revcomp(string const seq); + const bool no_variation(const Primer_pair& primer_pair) const; /** * Split a string into vectors. 
* Used in: load_priemrs */ - vector split(string str, string const delim); + const vector split(string str, const string& delim) const; }; diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index eb28b53db91..873955fbcb6 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -49,10 +49,10 @@ using namespace std; SECTION("Loads and processes the primers correctly") { primer_finder.add_primer_pair(9, 14, 20, 22, 0, 20); - vector primer_pairs = primer_finder.get_primer_pairs(); - vector selected_primer_pairs = primer_finder.get_selected_primer_pairs(); - Primer_pair pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. - Primer_pair pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. + const vector& primer_pairs = primer_finder.get_primer_pairs(); + const vector& selected_primer_pairs = primer_finder.get_selected_primer_pairs(); + const Primer_pair& pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. + const Primer_pair& pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. 
SECTION("Check for basic primer attributes") { REQUIRE(pair_0.left_primer.sequence == "TGCCTGGCATAGAGGAAAGC"); From 6a2f0bf4189b067f4ceea0d120c8ae10469eac3f Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 16 Oct 2023 16:55:50 -0700 Subject: [PATCH 09/43] added comments --- src/primer_filter.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 81c94cb1e8d..6eaa8285a64 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -4,6 +4,7 @@ namespace vg { using namespace std; +// Constructor Primer_finder::Primer_finder(const unique_ptr& graph_param, const string& reference_path_name, const SnarlDistanceIndex* distance_index_param) { graph = graph_param.get(); @@ -11,6 +12,7 @@ Primer_finder::Primer_finder(const unique_ptr& Primer_finder::get_selected_primer_pairs() const { return selected_primer_pairs; } + +// Make a new pair of primers with given attributes. Primers are processed and +// added to primer_pairs and selected_primer_pairs. void Primer_finder::add_primer_pair(const size_t& left_primer_starting_node_id, const size_t& left_primer_offset, const size_t& left_primer_length, const size_t& right_primer_starting_node_id, @@ -42,9 +47,12 @@ void Primer_finder::add_primer_pair(const size_t& left_primer_starting_node_id, } void Primer_finder::load_primers(const string& path_to_primers) { - regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); + + // regular expression patterns to look for primers' sequences, positions on + // the reference genome, and lengths + regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); // e.g. PRIMER_LEFT_0_SEQUENCE=ACCGT regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); - regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); + regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); // e.g. 
PRIMER_LEFT_0_=125,20 regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); Primer left_primer {""}; @@ -59,6 +67,7 @@ void Primer_finder::load_primers(const string& path_to_primers) { smatch match; if (regex_search(line, match, left_seq_pattern)) { if (right_primer.sequence != "") { + // primers' attributes are processed and stored into primer_pairs here map_to_nodes(left_primer); map_to_nodes(right_primer); Primer_pair primer_pair {left_primer, right_primer, @@ -109,9 +118,10 @@ void Primer_finder::make_primer(Primer& primer, const size_t& starting_node_id, } primer.length = length; string sequence = ""; - handle_t cur_handle = graph->get_handle(starting_node_id); + handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; primer.position = graph->get_position_of_step(cur_step_handle) + offset; + // Walk down the path and get the sequence of primer if (graph->get_length(cur_handle) - offset > length) { sequence += graph->get_sequence(cur_handle).substr(offset, length); } else { @@ -126,9 +136,9 @@ void Primer_finder::make_primer(Primer& primer, const size_t& starting_node_id, if (is_left) { primer.sequence = sequence; } else { - primer.sequence = reverse_complement(sequence); + primer.sequence = reverse_complement(sequence); // Take the reverse complement for right primer } - map_to_nodes(primer); + map_to_nodes(primer); // Search and store corresponding nodes ids } void Primer_finder::update_min_max_product_size(Primer_pair& primer_pair) { @@ -155,7 +165,11 @@ void Primer_finder::map_to_nodes(Primer& primer) { handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); string cur_node_sequence = graph->get_sequence(cur_node_handle); + // Get the index at which primer.sequence[0:index] maps to the first node. 
+ // Stop here if the first node contains the entire primer sequence size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq, true) - 1; + // If the first node containly a prefix of primer sequence, walk down the path and keep adding + // node until the entire primer sequence is covered while (primer_matched_index < primer_seq.size()-1) { cur_node_step_handle = graph->get_next_step(cur_node_step_handle); cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); @@ -174,6 +188,7 @@ size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, if (first_node) { if (llen >= rlen) { + // Check if the first node contains the entire sequence of the priemr for (size_t i = 0; i <= llen - rlen; i++) { if (left_seq.substr(i, rlen) == right_seq) { longest_match = rlen; @@ -183,6 +198,7 @@ size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, } } for (size_t i = 1; i <= length; i++) { + // Find the length of match between first node sequence's suffix and primer sequnece's prefix if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { longest_match = i; primer.offset = (primer.left && first_node) ? llen - i : i; @@ -190,6 +206,7 @@ size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, } } else { for (size_t i = 1; i <= length; i++) { + // Find the length of match between downstream nodes seqeunces and primer sequence if (left_seq.substr(0, i) == right_seq.substr(0, i)) { longest_match = i; primer.offset = (!primer.left) ? i : primer.offset; @@ -214,6 +231,7 @@ const bool Primer_finder::no_variation(const Primer_pair& primer_pair) const { Primer left_primer = primer_pair.left_primer; Primer right_primer = primer_pair.right_primer; for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { + // Check if any node has depth more than 1 (i.e. 
inside a bubble) handle_t cur_handle = graph->get_handle(*node_id); net_handle_t cur_net_handle = distance_index->get_net(cur_handle, graph); size_t depth = distance_index->get_depth(cur_net_handle); @@ -225,6 +243,7 @@ const bool Primer_finder::no_variation(const Primer_pair& primer_pair) const { } const vector Primer_finder::split(string str, const string& delim) const { + // Works like python split() function size_t cur_pos = 0; string word; vector word_list; From 0037f147ba4e78572f198927d2c382df168cfe33 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 16 Oct 2023 19:59:10 -0700 Subject: [PATCH 10/43] unittest --- src/unittest/primers.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/unittest/primers.cpp b/src/unittest/primers.cpp index d640267a10b..1ebe15088a1 100644 --- a/src/unittest/primers.cpp +++ b/src/unittest/primers.cpp @@ -27,7 +27,7 @@ namespace vg { namespace unittest { - TEST_CASE( "filter simple primers", + TEST_CASE( "temporary place to write code in for primers", "[primers]" ) { struct Primer { @@ -448,7 +448,6 @@ namespace vg { string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; distance_index.deserialize(snarl_index_path); - //graph = vg::io::VPKG::load_one(xg_graph_path); graph = vg::io::VPKG::load_one(xg_graph_path); //Primer_finder primer_finder; From 92e38de5ac51e08734a1e3c40845350ced705e9a Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 16 Oct 2023 23:26:32 -0700 Subject: [PATCH 11/43] start on interface --- src/subcommand/primers_main.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/subcommand/primers_main.cpp diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp new file mode 100644 index 00000000000..7e4916986d7 --- /dev/null +++ b/src/subcommand/primers_main.cpp @@ -0,0 +1,30 @@ +#include + +#include +#include + 
+#include + +#include "../primer_filter.hpp" +#include "../snarl_distance_index.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_primers(char** argv) { + cerr << "usage: " << argv[0] << " primers [options] input.primer3 > filtered_primers.out" << endl + << endl + << "options:" << endl + << " -z, --zero-variance Allow no variance in the product" << endl + << " -l, --tolerance INT Allow this much difference between minimum and maximum sizes compared to the linear product size" << endl + << " -n, --minimum-size INT Minimum product size allowed (has precedence over --tolerance)" << endl + << " -m, --maximum-size INT Maximum product size allowed (has precedence over --tolerance)" << endl; +} + +int main_primers(int argc, char** argv) { + + if (argc == 2) { + help_priemrs(argv) + } +} \ No newline at end of file From 2ba2521e9cbbdfad29477cd36619d2c96a1b4a09 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 17 Oct 2023 11:04:18 -0700 Subject: [PATCH 12/43] update --- src/subcommand/primers_main.cpp | 102 ++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 6 deletions(-) diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 7e4916986d7..229aa8cdd47 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -16,15 +16,105 @@ void help_primers(char** argv) { cerr << "usage: " << argv[0] << " primers [options] input.primer3 > filtered_primers.out" << endl << endl << "options:" << endl - << " -z, --zero-variance Allow no variance in the product" << endl - << " -l, --tolerance INT Allow this much difference between minimum and maximum sizes compared to the linear product size" << endl - << " -n, --minimum-size INT Minimum product size allowed (has precedence over --tolerance)" << endl - << " -m, --maximum-size INT Maximum product size allowed (has precedence over --tolerance)" << endl; + << " -x, --xg-path FILE use this xg graph" << endl + << " -s, 
--snarl-index FILE use this snarl index" << endl + << " -z, --zero-variance allow no variance in the product" << endl + << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl + << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl + << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl; } int main_primers(int argc, char** argv) { if (argc == 2) { - help_priemrs(argv) + help_primers(argv); + return 1; } -} \ No newline at end of file + + string xg_path; + string snarl_index_path; + bool zero_variance = false; + int tolerance = 20; + int minimum_product_size = numeric_limits::max(); + int maximum_product_size = numeric_limits::max(); + + int c; + optind = 2; + + while (true) { + static struct option long_options[] = + { + {"help", no_argument, 0, 'h'}, + {"xg-path", required_argument, 0, 'x'}, + {"snarl-index", required_argument, 0, 's'}, + {"zero-variance", required_argument, 0, 'z'}, + {"tolerance", required_argument, 0, 'l'}, + {"minimum-size", required_argument, 0, 'n'}, + {"maximum-size", required_argument, 0, 'm'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "hx:s:zl:n:m:", + long_options, &option_index); + + // Detect the end of the options. 
+ if (c == -1) break; + + switch (c) + { + case 'x': + xg_path = optarg; + break; + + case 's': + snarl_index_path = optarg; + break; + + case 'z': + zero_variance = true; + break; + + case 'l': + tolerance = parse(optarg); + break; + + case 'n': + minimum_product_size = parse(optarg); + break; + + case 'm': + maximum_product_size = parse(optarg); + break; + + case 'h': + case '?': + help_primers(argv); + exit(1); + break; + + default: + abort (); + } + } + + if (xg_path.empty()) { + cerr << "error:[vg primers] xg file (-x) is required" << endl; + exit(1); + } + + if (snarl_index_path.empty()) { + cerr << "error:[vg priemrs] snarl index file (-s) is required" << endl; + exit(1); + } + + string file_name = get_input_file_name(optind, argc, argv); + + cout << "primer file name: " << file_name << endl + << "xg file name: " << xg_path << endl + << "snarl index file name: " << snarl_index_path << endl; + + return 0; +} + +static Subcommand vg_primers("primers", "filter primers for low variation", main_primers); \ No newline at end of file From aace099cb5ff599e0319a5ef92c56a112a0c4768 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 17 Oct 2023 16:41:44 -0700 Subject: [PATCH 13/43] update --- src/primer_filter.cpp | 113 +++++++++++++++++---------------- src/primer_filter.hpp | 34 +++++----- src/unittest/primer_filter.cpp | 10 +-- 3 files changed, 79 insertions(+), 78 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 6eaa8285a64..fd7352bcda1 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -5,48 +5,50 @@ namespace vg { using namespace std; // Constructor -Primer_finder::Primer_finder(const unique_ptr& graph_param, +PrimerFinder::PrimerFinder(const unique_ptr& graph_param, const string& reference_path_name, const SnarlDistanceIndex* distance_index_param) { graph = graph_param.get(); - reference_path_handle = graph->get_path_handle("y"); + reference_path_handle = graph->get_path_handle(reference_path_name); distance_index = 
distance_index_param; } // Destructor -Primer_finder::~Primer_finder() { +PrimerFinder::~PrimerFinder() { // nothing to do } -const vector& Primer_finder::get_primer_pairs() const { +const vector& PrimerFinder::get_primer_pairs() const { return primer_pairs; } -const vector& Primer_finder::get_selected_primer_pairs() const { +const vector& PrimerFinder::get_selected_primer_pairs() const { return selected_primer_pairs; } // Make a new pair of primers with given attributes. Primers are processed and // added to primer_pairs and selected_primer_pairs. -void Primer_finder::add_primer_pair(const size_t& left_primer_starting_node_id, +void PrimerFinder::add_primer_pair(const size_t& left_primer_starting_node_id, const size_t& left_primer_offset, const size_t& left_primer_length, const size_t& right_primer_starting_node_id, const size_t& right_primer_offset, const size_t& right_primer_length) { - Primer left_primer; - Primer right_primer; - make_primer(left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); - make_primer(right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); - Primer_pair new_primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - update_min_max_product_size(new_primer_pair); - if (no_variation(new_primer_pair)) { - new_primer_pair.no_variation = true; - selected_primer_pairs.push_back(new_primer_pair); + + primer_pairs.emplace_back(); + PrimerPair& primer_pair = primer_pairs.back(); + primer_pair.right_primer.left = false; + + make_primer(primer_pair.left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); + make_primer(primer_pair.right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); + primer_pair.linear_product_size = primer_pair.right_primer.position - primer_pair.left_primer.position + primer_pair.right_primer.length; + 
update_min_max_product_size(primer_pair); + if (no_variation(primer_pair)) { + primer_pair.no_variation = true; + selected_primer_pairs.push_back(primer_pairs.back()); } - primer_pairs.push_back(new_primer_pair); + } -void Primer_finder::load_primers(const string& path_to_primers) { +void PrimerFinder::load_primers(const string& path_to_primers) { // regular expression patterns to look for primers' sequences, positions on // the reference genome, and lengths @@ -55,9 +57,9 @@ void Primer_finder::load_primers(const string& path_to_primers) { regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); // e.g. PRIMER_LEFT_0_=125,20 regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); - Primer left_primer {""}; - Primer right_primer {"", false}; - + // iniate first primer pair + primer_pairs.emplace_back(); + primer_pairs.back().right_primer.left = false; ifstream file_handle(path_to_primers); assert(file_handle.is_open()); @@ -66,50 +68,48 @@ void Primer_finder::load_primers(const string& path_to_primers) { line = rstrip(line); smatch match; if (regex_search(line, match, left_seq_pattern)) { - if (right_primer.sequence != "") { + if (primer_pairs.back().right_primer.sequence != "") { // primers' attributes are processed and stored into primer_pairs here - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - update_min_max_product_size(primer_pair); - if (no_variation(primer_pair)) { - primer_pair.no_variation = true; - selected_primer_pairs.push_back(primer_pair); + map_to_nodes(primer_pairs.back().left_primer); + map_to_nodes(primer_pairs.back().right_primer); + primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position + - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; + update_min_max_product_size(primer_pairs.back()); + if (no_variation(primer_pairs.back())) { + 
primer_pairs.back().no_variation = true; + selected_primer_pairs.push_back(primer_pairs.back()); + PrimerPair& pp = primer_pairs.back(); } - primer_pairs.push_back(primer_pair); - left_primer = {""}; - right_primer = {"", false}; + primer_pairs.emplace_back(); + primer_pairs.back().right_primer.left = false; } - left_primer.sequence = match[1]; + primer_pairs.back().left_primer.sequence = match[1]; } else if (regex_search(line, match, right_seq_pattern)) { - right_primer.sequence = match[1]; + primer_pairs.back().right_primer.sequence = match[1]; } else if (regex_search(line, match, left_pos_pattern)) { const vector pos_and_len = split(match[1], ","); - left_primer.position = stoi(pos_and_len[0]); - left_primer.length = stoi(pos_and_len[1]); + primer_pairs.back().left_primer.position = stoi(pos_and_len[0]); + primer_pairs.back().left_primer.length = stoi(pos_and_len[1]); } else if (regex_search(line, match, right_pos_pattern)) { const vector pos_and_len = split(match[1], ","); - right_primer.length = stoi(pos_and_len[1]); - right_primer.position = stoi(pos_and_len[0]) - right_primer.length + 1; - + primer_pairs.back().right_primer.length = stoi(pos_and_len[1]); + primer_pairs.back().right_primer.position = stoi(pos_and_len[0]) - stoi(pos_and_len[1]) + 1; } } // Process and store the last pair of primers - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - update_min_max_product_size(primer_pair); - if (no_variation(primer_pair)) { - primer_pair.no_variation = true; - selected_primer_pairs.push_back(primer_pair); + map_to_nodes(primer_pairs.back().left_primer); + map_to_nodes(primer_pairs.back().right_primer); + primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position + - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; + update_min_max_product_size(primer_pairs.back()); + 
if (no_variation(primer_pairs.back())) { + primer_pairs.back().no_variation = true; + selected_primer_pairs.push_back(primer_pairs.back()); } - primer_pairs.push_back(primer_pair); } -void Primer_finder::make_primer(Primer& primer, const size_t& starting_node_id, +void PrimerFinder::make_primer(Primer& primer, const size_t& starting_node_id, const size_t& offset, const size_t& length, const bool& is_left) { if (is_left) { primer.left = true; @@ -141,7 +141,7 @@ void Primer_finder::make_primer(Primer& primer, const size_t& starting_node_id, map_to_nodes(primer); // Search and store corresponding nodes ids } -void Primer_finder::update_min_max_product_size(Primer_pair& primer_pair) { +void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { const Primer& left_primer = primer_pair.left_primer; const Primer& right_primer = primer_pair.right_primer; @@ -154,13 +154,14 @@ void Primer_finder::update_min_max_product_size(Primer_pair& primer_pair) { false, right_primer.offset); } -void Primer_finder::map_to_nodes(Primer& primer) { +void PrimerFinder::map_to_nodes(Primer& primer) { string primer_seq; if (primer.left) { primer_seq = primer.sequence; } else { primer_seq = reverse_complement(primer.sequence); } + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); @@ -180,7 +181,7 @@ void Primer_finder::map_to_nodes(Primer& primer) { } } -size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, +size_t PrimerFinder::longest_match_len(Primer& primer, const string& left_seq, const string& right_seq, const bool& first_node) { size_t llen = left_seq.size(), rlen = right_seq.size(); size_t length = min(llen, rlen); @@ -217,7 +218,7 @@ size_t Primer_finder::longest_match_len(Primer& primer, const string& left_seq, return longest_match; } -const string 
Primer_finder::rstrip(const string& s) const { +const string PrimerFinder::rstrip(const string& s) const { const string WHITESPACE = " \n\r\t\f\v"; size_t end = s.find_last_not_of(WHITESPACE); if (end == string::npos) { @@ -227,7 +228,7 @@ const string Primer_finder::rstrip(const string& s) const { } -const bool Primer_finder::no_variation(const Primer_pair& primer_pair) const { +const bool PrimerFinder::no_variation(const PrimerPair& primer_pair) const { Primer left_primer = primer_pair.left_primer; Primer right_primer = primer_pair.right_primer; for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { @@ -242,7 +243,7 @@ const bool Primer_finder::no_variation(const Primer_pair& primer_pair) const { return true; } -const vector Primer_finder::split(string str, const string& delim) const { +const vector PrimerFinder::split(string str, const string& delim) const { // Works like python split() function size_t cur_pos = 0; string word; diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index a85b3355643..5ebf377a72c 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -37,9 +37,9 @@ namespace vg { struct Primer { string sequence; bool left = true; - size_t position; - size_t length; - size_t offset; + size_t position = numeric_limits::max(); + size_t length = numeric_limits::max(); + size_t offset = numeric_limits::max(); vector mapped_nodes_ids; }; @@ -48,38 +48,38 @@ struct Primer { * linear product size, minimum and maximum product size on the sequence graph, and boolean on * whether the primers locate in low variation region of the sequence graph. 
*/ -struct Primer_pair { +struct PrimerPair { Primer left_primer; Primer right_primer; - size_t linear_product_size; - size_t min_product_size; - size_t max_product_size; + size_t linear_product_size = numeric_limits::max(); + size_t min_product_size = numeric_limits::max(); + size_t max_product_size = numeric_limits::max(); bool no_variation = false; }; -class Primer_finder { +class PrimerFinder { private: - vector primer_pairs; - vector selected_primer_pairs; + vector primer_pairs; + vector selected_primer_pairs; const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; path_handle_t reference_path_handle; public: - Primer_finder() = default; + PrimerFinder() = default; /** * Construct Primer finder given PathPositionHandleGraph, reference graph name * and pointer to SnarlDistanceIndex */ - Primer_finder(const unique_ptr& graph_param, + PrimerFinder(const unique_ptr& graph_param, const string& reference_path_name, const SnarlDistanceIndex* distance_index_param); /** * Destructor */ - ~Primer_finder(); + ~PrimerFinder(); /** * Add a Primer_pair object given primers' starting node id, offset relative @@ -102,12 +102,12 @@ class Primer_finder { /** * return vector of Primer pairs */ - const vector& get_primer_pairs() const; + const vector& get_primer_pairs() const; /** * return vector selected primer pairs */ - const vector& get_selected_primer_pairs() const; + const vector& get_selected_primer_pairs() const; private: /** @@ -119,7 +119,7 @@ class Primer_finder { * Used in: add_primer_pair * load_primers */ - void update_min_max_product_size(Primer_pair& primer_pair); + void update_min_max_product_size(PrimerPair& primer_pair); /** * Update a Primer object given starting node id, offset relative to the starting node, @@ -155,7 +155,7 @@ class Primer_finder { * Used in: add_primer_node * load_primers */ - const bool no_variation(const Primer_pair& primer_pair) const; + const bool no_variation(const PrimerPair& primer_pair) const; /** * Split a 
string into vectors. diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 873955fbcb6..85299d92c19 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -40,7 +40,7 @@ using namespace std; string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); - Primer_finder primer_finder(graph, "y", &distance_index); + PrimerFinder primer_finder(graph, "y", &distance_index); primer_finder.load_primers(primers_path); SECTION("Loads the correct number of primer pairs") { @@ -49,10 +49,10 @@ using namespace std; SECTION("Loads and processes the primers correctly") { primer_finder.add_primer_pair(9, 14, 20, 22, 0, 20); - const vector& primer_pairs = primer_finder.get_primer_pairs(); - const vector& selected_primer_pairs = primer_finder.get_selected_primer_pairs(); - const Primer_pair& pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. - const Primer_pair& pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. + const vector& primer_pairs = primer_finder.get_primer_pairs(); + const vector& selected_primer_pairs = primer_finder.get_selected_primer_pairs(); + const PrimerPair& pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. + const PrimerPair& pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. 
SECTION("Check for basic primer attributes") { REQUIRE(pair_0.left_primer.sequence == "TGCCTGGCATAGAGGAAAGC"); From 5ce857c4fbb6a1d26cc1e386c9069c5876f6acd3 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 17 Oct 2023 21:35:32 -0700 Subject: [PATCH 14/43] multiple reference paths --- src/primer_filter.cpp | 239 +++++++++++++++++++++++++----------------- src/primer_filter.hpp | 35 ++++--- 2 files changed, 162 insertions(+), 112 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index fd7352bcda1..2c75d9eaa18 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -6,9 +6,8 @@ using namespace std; // Constructor PrimerFinder::PrimerFinder(const unique_ptr& graph_param, - const string& reference_path_name, const SnarlDistanceIndex* distance_index_param) { + const SnarlDistanceIndex* distance_index_param) { graph = graph_param.get(); - reference_path_handle = graph->get_path_handle(reference_path_name); distance_index = distance_index_param; } @@ -17,13 +16,10 @@ PrimerFinder::~PrimerFinder() { // nothing to do } -const vector& PrimerFinder::get_primer_pairs() const { - return primer_pairs; +const vector& PrimerFinder::get_primer_pairs(const string) const { + return chroms[string]; } -const vector& PrimerFinder::get_selected_primer_pairs() const { - return selected_primer_pairs; -} // Make a new pair of primers with given attributes. Primers are processed and @@ -50,97 +46,142 @@ void PrimerFinder::add_primer_pair(const size_t& left_primer_starting_node_id, void PrimerFinder::load_primers(const string& path_to_primers) { - // regular expression patterns to look for primers' sequences, positions on - // the reference genome, and lengths - regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); // e.g. PRIMER_LEFT_0_SEQUENCE=ACCGT - regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); - regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); // e.g. 
PRIMER_LEFT_0_=125,20 - regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); - - // iniate first primer pair - primer_pairs.emplace_back(); - primer_pairs.back().right_primer.left = false; ifstream file_handle(path_to_primers); assert(file_handle.is_open()); + vector cur_fields; + string cur_path; string line; while (getline(file_handle, line)) { - line = rstrip(line); - smatch match; - if (regex_search(line, match, left_seq_pattern)) { - if (primer_pairs.back().right_primer.sequence != "") { - // primers' attributes are processed and stored into primer_pairs here - map_to_nodes(primer_pairs.back().left_primer); - map_to_nodes(primer_pairs.back().right_primer); - primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position - - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; - update_min_max_product_size(primer_pairs.back()); - if (no_variation(primer_pairs.back())) { - primer_pairs.back().no_variation = true; - selected_primer_pairs.push_back(primer_pairs.back()); - PrimerPair& pp = primer_pairs.back(); - } - primer_pairs.emplace_back(); - primer_pairs.back().right_primer.left = false; + line = strip(line); + size_t left_primer_line_start = line.find("LEFT PRIMER"); + size_t right_primer_line_start = line.find("RIGHT PRIMER"); + + if (startswith(line, "PRIMER PICKING RESULTS FOR")) { + if (chroms.size() != 0) { + assert(chroms[cur_path].back().right_primer.sequence.empty()); + chroms[cur_path].pop_back(); } - primer_pairs.back().left_primer.sequence = match[1]; - } else if (regex_search(line, match, right_seq_pattern)) { - primer_pairs.back().right_primer.sequence = match[1]; - } else if (regex_search(line, match, left_pos_pattern)) { - const vector pos_and_len = split(match[1], ","); - primer_pairs.back().left_primer.position = stoi(pos_and_len[0]); - primer_pairs.back().left_primer.length = stoi(pos_and_len[1]); - } else if (regex_search(line, match, right_pos_pattern)) { - const vector pos_and_len = 
split(match[1], ","); - primer_pairs.back().right_primer.length = stoi(pos_and_len[1]); - primer_pairs.back().right_primer.position = stoi(pos_and_len[0]) - stoi(pos_and_len[1]) + 1; + cur_fields = move(split(line)); + cur_path = fields[fields.size()-1]; + chroms[cur_path].emplace_back(); + chroms[cur_path].back().right_primer.left = false; + } else if (left_primer_line_start != string::npos) { + cur_fields = move(split(line.substr(left_primer_line_start, line.size()-1))); + PrimerPair& primer_pair = chroms[cur_path].back(); + primer_pair.left_primer.sequence = cur_fields[9]; + primer_pair.left_primer.position = stoi(cur_fields[2]); + primer_pair.left_primer.length = stoi(cur_fields[3]); + } else if (startswith(line, "RIGHT PRIMER")) { + cur_fields = move(split(line.substr(left_primer_line_start, line.size()-1))); + PrimerPair& primer_pair = chroms[cur_path].back(); + primer_pair.right_primer.sequence = cur_fields[9]; + primer_pair.right_primer.position = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1; + primer_pair.right_primer.length = stoi(cur_fields[3]); + + assert(!primer_pair.left_primer.sequence.empty()); + map_to_nodes(primer_pair.left_primer, cur_path); + map_to_nodes(primer_pair.right_primer, cur_path); + primer_pair.linear_product_size = primer_pair.right_primer.position + - primer_pair.left_primer.position + primer_pair.right_primer.length; + update_min_max_product_size(primer_pair); + if (no_variation(primer_pair)) { + primer_pair.no_variation = true; + } + + chroms[cur_path].emplace_back(); + chroms[cur_path].back().right_primer.left = false; } + assert(chroms[cur_path].back().right_primer.sequence.empty()); + chroms[cur_path].pop_back(); + // line = rstrip(line); + // smatch match; + // if (regex_search(line, match, left_seq_pattern)) { + // if (primer_pairs.back().right_primer.sequence != "") { + // // primers' attributes are processed and stored into primer_pairs here + // map_to_nodes(primer_pairs.back().left_primer); + // 
map_to_nodes(primer_pairs.back().right_primer); + // primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position + // - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; + // update_min_max_product_size(primer_pairs.back()); + // if (no_variation(primer_pairs.back())) { + // primer_pairs.back().no_variation = true; + // selected_primer_pairs.push_back(primer_pairs.back()); + // PrimerPair& pp = primer_pairs.back(); + // } + // primer_pairs.emplace_back(); + // primer_pairs.back().right_primer.left = false; + // } + // primer_pairs.back().left_primer.sequence = match[1]; + // } else if (regex_search(line, match, right_seq_pattern)) { + // primer_pairs.back().right_primer.sequence = match[1]; + // } else if (regex_search(line, match, left_pos_pattern)) { + // const vector pos_and_len = split(match[1], ","); + // primer_pairs.back().left_primer.position = stoi(pos_and_len[0]); + // primer_pairs.back().left_primer.length = stoi(pos_and_len[1]); + // } else if (regex_search(line, match, right_pos_pattern)) { + // const vector pos_and_len = split(match[1], ","); + // primer_pairs.back().right_primer.length = stoi(pos_and_len[1]); + // primer_pairs.back().right_primer.position = stoi(pos_and_len[0]) - stoi(pos_and_len[1]) + 1; + // } } // Process and store the last pair of primers - map_to_nodes(primer_pairs.back().left_primer); - map_to_nodes(primer_pairs.back().right_primer); - primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position - - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; - update_min_max_product_size(primer_pairs.back()); - if (no_variation(primer_pairs.back())) { - primer_pairs.back().no_variation = true; - selected_primer_pairs.push_back(primer_pairs.back()); - } + // map_to_nodes(primer_pairs.back().left_primer); + // map_to_nodes(primer_pairs.back().right_primer); + // primer_pairs.back().linear_product_size = 
primer_pairs.back().right_primer.position + // - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; + // update_min_max_product_size(primer_pairs.back()); + // if (no_variation(primer_pairs.back())) { + // primer_pairs.back().no_variation = true; + // selected_primer_pairs.push_back(primer_pairs.back()); + // } } -void PrimerFinder::make_primer(Primer& primer, const size_t& starting_node_id, - const size_t& offset, const size_t& length, const bool& is_left) { - if (is_left) { - primer.left = true; - } else { - primer.left = false; - } - primer.length = length; - string sequence = ""; - handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle - step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; - primer.position = graph->get_position_of_step(cur_step_handle) + offset; - // Walk down the path and get the sequence of primer - if (graph->get_length(cur_handle) - offset > length) { - sequence += graph->get_sequence(cur_handle).substr(offset, length); - } else { - sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); - while (sequence.size() < length) { - cur_step_handle = graph->get_next_step(cur_step_handle); - cur_handle = graph->get_handle_of_step(cur_step_handle); - sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); - } - } +const size_t PrimerFinder::total_reference_paths() const { + return chroms.size(); +} - if (is_left) { - primer.sequence = sequence; - } else { - primer.sequence = reverse_complement(sequence); // Take the reverse complement for right primer +vector PrimerFinder::get_reference_paths() { + vector reference_paths; + for (const auto& chrom : chroms) { + reference_paths.push_back(chrom.first); } - map_to_nodes(primer); // Search and store corresponding nodes ids + return reference_paths; } +// void PrimerFinder::make_primer(Primer& primer, const size_t& 
starting_node_id, +// const size_t& offset, const size_t& length, const bool& is_left) { +// if (is_left) { +// primer.left = true; +// } else { +// primer.left = false; +// } +// primer.length = length; +// string sequence = ""; +// handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle +// step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; +// primer.position = graph->get_position_of_step(cur_step_handle) + offset; +// // Walk down the path and get the sequence of primer +// if (graph->get_length(cur_handle) - offset > length) { +// sequence += graph->get_sequence(cur_handle).substr(offset, length); +// } else { +// sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); +// while (sequence.size() < length) { +// cur_step_handle = graph->get_next_step(cur_step_handle); +// cur_handle = graph->get_handle_of_step(cur_step_handle); +// sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); +// } +// } + +// if (is_left) { +// primer.sequence = sequence; +// } else { +// primer.sequence = reverse_complement(sequence); // Take the reverse complement for right primer +// } +// map_to_nodes(primer); // Search and store corresponding nodes ids +// } + void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { const Primer& left_primer = primer_pair.left_primer; const Primer& right_primer = primer_pair.right_primer; @@ -154,7 +195,8 @@ void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { false, right_primer.offset); } -void PrimerFinder::map_to_nodes(Primer& primer) { +void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { + path_handle_t reference_path_handle = graph->get_path_handle(path_name); string primer_seq; if (primer.left) { primer_seq = primer.sequence; @@ -218,13 +260,14 @@ size_t PrimerFinder::longest_match_len(Primer& primer, const string& 
left_seq, return longest_match; } -const string PrimerFinder::rstrip(const string& s) const { +const string PrimerFinder::strip(const string& s) const { const string WHITESPACE = " \n\r\t\f\v"; - size_t end = s.find_last_not_of(WHITESPACE); + size_t end = s.find_last_not_of(WHITESPACE); + size_t start = s.find_first_not_of(WHITESPACE); if (end == string::npos) { return ""; } - return s.substr(0, end+1); + return s.substr(start, end+1); } @@ -243,19 +286,19 @@ const bool PrimerFinder::no_variation(const PrimerPair& primer_pair) const { return true; } -const vector PrimerFinder::split(string str, const string& delim) const { - // Works like python split() function - size_t cur_pos = 0; - string word; - vector word_list; - while ((cur_pos = str.find(delim)) != string::npos) { - word = str.substr(0, cur_pos); - word_list.push_back(word); - str.erase(0, cur_pos + delim.length()); +vector PrimerFinder::split(const string& str) { + istringstream iss(str); + string field; + vector fields; + + while (iss >> field) { + fields.push_back(field); } - word = str; - word_list.push_back(word); - return word_list; + return fields; +} + +bool PrimerFinder::startswith(const string& str, const string& prefix) { + return str.compare(0, prefix.length(), prefix) == 0; } } \ No newline at end of file diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index 5ebf377a72c..a1ade072130 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -60,8 +60,9 @@ struct PrimerPair { class PrimerFinder { private: - vector primer_pairs; - vector selected_primer_pairs; + // vector primer_pairs; + // vector selected_primer_pairs; + map> chroms; // map containing a vector of primer pairs for each chromosome const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; path_handle_t reference_path_handle; @@ -74,7 +75,7 @@ class PrimerFinder { * and pointer to SnarlDistanceIndex */ PrimerFinder(const unique_ptr& graph_param, - const string& reference_path_name, const 
SnarlDistanceIndex* distance_index_param); + const SnarlDistanceIndex* distance_index_param); /** * Destructor @@ -102,12 +103,14 @@ class PrimerFinder { /** * return vector of Primer pairs */ - const vector& get_primer_pairs() const; + const vector& get_primer_pairs(const string chrom) const; /** - * return vector selected primer pairs + * return the total number of reference paths */ - const vector& get_selected_primer_pairs() const; + const size_t total_reference_paths() const; + + vector get_reference_paths(); private: /** @@ -126,15 +129,15 @@ class PrimerFinder { * and the length of primer. * Used in: add_primer_pair */ - void make_primer(Primer& primer, const size_t& starting_node_id, - const size_t& offset, const size_t& length, const bool& is_left); + // void make_primer(Primer& primer, const size_t& starting_node_id, + // const size_t& offset, const size_t& length, const bool& is_left); /** * Find and store corresponding node ids to Primer object. * Used in: make_primer * load_primers */ - void map_to_nodes(Primer& primer); + void map_to_nodes(Primer& primer, const string& path_name); /** * Find the length of the longest match between two sequences. Also find and @@ -145,10 +148,10 @@ class PrimerFinder { const bool& first_node); /** - * Strip empty spaces on the right side of a string. + * Strip empty spaces on the sides of a string. * Used in: load_primers */ - const string rstrip(const string& s) const; + const string strip(const string& s) const; /** * Check if primers in a primer_pair object have variations on the pangenome. @@ -159,10 +162,14 @@ class PrimerFinder { /** * Split a string into vectors. 
- * Used in: load_priemrs + * Used in: load_primers */ - const vector split(string str, const string& delim) const; - + vector split(const string& str); + /** + * Works like str.startswith(prefix) in python + * Used in: load_primers + */ + bool startswith(const string& str, const string& prefix); }; } From ed2e750f3a1bf1923252b14f90b65e094d7e47eb Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Wed, 18 Oct 2023 16:55:45 -0700 Subject: [PATCH 15/43] beautify --- src/primer_filter.cpp | 192 ++++++++++++++------------------ src/primer_filter.hpp | 29 ++--- src/subcommand/primers_main.cpp | 31 ++++-- src/unittest/primer_filter.cpp | 142 +++++++++++++++-------- 4 files changed, 215 insertions(+), 179 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 2c75d9eaa18..cddfefc16cd 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -1,4 +1,5 @@ #include "primer_filter.hpp" +#include namespace vg { @@ -16,32 +17,26 @@ PrimerFinder::~PrimerFinder() { // nothing to do } -const vector& PrimerFinder::get_primer_pairs(const string) const { - return chroms[string]; +const vector& PrimerFinder::get_primer_pairs_of_chrom(const string chrom_name) const { + return chroms.at(chrom_name); } - - // Make a new pair of primers with given attributes. Primers are processed and // added to primer_pairs and selected_primer_pairs. 
-void PrimerFinder::add_primer_pair(const size_t& left_primer_starting_node_id, - const size_t& left_primer_offset, const size_t& left_primer_length, - const size_t& right_primer_starting_node_id, - const size_t& right_primer_offset, const size_t& right_primer_length) { +void PrimerFinder::add_primer_pair(const string& path_name, + const size_t& left_primer_starting_node_id, const size_t& left_primer_offset, + const size_t& left_primer_length, const size_t& right_primer_starting_node_id, + const size_t& right_primer_offset, const size_t& right_primer_length) { - primer_pairs.emplace_back(); - PrimerPair& primer_pair = primer_pairs.back(); + chroms.at(path_name).emplace_back(); + PrimerPair& primer_pair = chroms.at(path_name).back(); primer_pair.right_primer.left = false; - make_primer(primer_pair.left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); - make_primer(primer_pair.right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); + make_primer(primer_pair.left_primer, path_name, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); + make_primer(primer_pair.right_primer, path_name, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); primer_pair.linear_product_size = primer_pair.right_primer.position - primer_pair.left_primer.position + primer_pair.right_primer.length; update_min_max_product_size(primer_pair); - if (no_variation(primer_pair)) { - primer_pair.no_variation = true; - selected_primer_pairs.push_back(primer_pairs.back()); - } - + update_variation(primer_pair, path_name); } void PrimerFinder::load_primers(const string& path_to_primers) { @@ -63,79 +58,35 @@ void PrimerFinder::load_primers(const string& path_to_primers) { chroms[cur_path].pop_back(); } cur_fields = move(split(line)); - cur_path = fields[fields.size()-1]; + cur_path = cur_fields[cur_fields.size()-1]; chroms[cur_path].emplace_back(); 
chroms[cur_path].back().right_primer.left = false; } else if (left_primer_line_start != string::npos) { - cur_fields = move(split(line.substr(left_primer_line_start, line.size()-1))); + cur_fields = move(split(line.substr(left_primer_line_start, line.size()))); PrimerPair& primer_pair = chroms[cur_path].back(); primer_pair.left_primer.sequence = cur_fields[9]; primer_pair.left_primer.position = stoi(cur_fields[2]); primer_pair.left_primer.length = stoi(cur_fields[3]); } else if (startswith(line, "RIGHT PRIMER")) { - cur_fields = move(split(line.substr(left_primer_line_start, line.size()-1))); + cur_fields = move(split(line.substr(right_primer_line_start, line.size()))); PrimerPair& primer_pair = chroms[cur_path].back(); primer_pair.right_primer.sequence = cur_fields[9]; primer_pair.right_primer.position = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1; primer_pair.right_primer.length = stoi(cur_fields[3]); - + assert(!primer_pair.left_primer.sequence.empty()); map_to_nodes(primer_pair.left_primer, cur_path); map_to_nodes(primer_pair.right_primer, cur_path); primer_pair.linear_product_size = primer_pair.right_primer.position - primer_pair.left_primer.position + primer_pair.right_primer.length; update_min_max_product_size(primer_pair); - if (no_variation(primer_pair)) { - primer_pair.no_variation = true; - } - + update_variation(primer_pair, cur_path); chroms[cur_path].emplace_back(); chroms[cur_path].back().right_primer.left = false; } + } assert(chroms[cur_path].back().right_primer.sequence.empty()); chroms[cur_path].pop_back(); - // line = rstrip(line); - // smatch match; - // if (regex_search(line, match, left_seq_pattern)) { - // if (primer_pairs.back().right_primer.sequence != "") { - // // primers' attributes are processed and stored into primer_pairs here - // map_to_nodes(primer_pairs.back().left_primer); - // map_to_nodes(primer_pairs.back().right_primer); - // primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position - // - 
primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; - // update_min_max_product_size(primer_pairs.back()); - // if (no_variation(primer_pairs.back())) { - // primer_pairs.back().no_variation = true; - // selected_primer_pairs.push_back(primer_pairs.back()); - // PrimerPair& pp = primer_pairs.back(); - // } - // primer_pairs.emplace_back(); - // primer_pairs.back().right_primer.left = false; - // } - // primer_pairs.back().left_primer.sequence = match[1]; - // } else if (regex_search(line, match, right_seq_pattern)) { - // primer_pairs.back().right_primer.sequence = match[1]; - // } else if (regex_search(line, match, left_pos_pattern)) { - // const vector pos_and_len = split(match[1], ","); - // primer_pairs.back().left_primer.position = stoi(pos_and_len[0]); - // primer_pairs.back().left_primer.length = stoi(pos_and_len[1]); - // } else if (regex_search(line, match, right_pos_pattern)) { - // const vector pos_and_len = split(match[1], ","); - // primer_pairs.back().right_primer.length = stoi(pos_and_len[1]); - // primer_pairs.back().right_primer.position = stoi(pos_and_len[0]) - stoi(pos_and_len[1]) + 1; - // } - } - - // Process and store the last pair of primers - // map_to_nodes(primer_pairs.back().left_primer); - // map_to_nodes(primer_pairs.back().right_primer); - // primer_pairs.back().linear_product_size = primer_pairs.back().right_primer.position - // - primer_pairs.back().left_primer.position + primer_pairs.back().right_primer.length; - // update_min_max_product_size(primer_pairs.back()); - // if (no_variation(primer_pairs.back())) { - // primer_pairs.back().no_variation = true; - // selected_primer_pairs.push_back(primer_pairs.back()); - // } } const size_t PrimerFinder::total_reference_paths() const { @@ -150,40 +101,42 @@ vector PrimerFinder::get_reference_paths() { return reference_paths; } -// void PrimerFinder::make_primer(Primer& primer, const size_t& starting_node_id, -// const size_t& offset, const size_t& length, 
const bool& is_left) { -// if (is_left) { -// primer.left = true; -// } else { -// primer.left = false; -// } -// primer.length = length; -// string sequence = ""; -// handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle -// step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; -// primer.position = graph->get_position_of_step(cur_step_handle) + offset; -// // Walk down the path and get the sequence of primer -// if (graph->get_length(cur_handle) - offset > length) { -// sequence += graph->get_sequence(cur_handle).substr(offset, length); -// } else { -// sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); -// while (sequence.size() < length) { -// cur_step_handle = graph->get_next_step(cur_step_handle); -// cur_handle = graph->get_handle_of_step(cur_step_handle); -// sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); -// } -// } +void PrimerFinder::make_primer(Primer& primer, const string& path_name, + const size_t& starting_node_id, const size_t& offset, const size_t& length, + const bool& is_left) { + + if (is_left) { + primer.left = true; + } else { + primer.left = false; + } + primer.length = length; + string sequence = ""; + handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle + step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; + primer.position = graph->get_position_of_step(cur_step_handle) + offset; + // Walk down the path and get the sequence of primer + if (graph->get_length(cur_handle) - offset > length) { + sequence += graph->get_sequence(cur_handle).substr(offset, length); + } else { + sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); + while (sequence.size() < length) { + cur_step_handle = graph->get_next_step(cur_step_handle); + cur_handle = 
graph->get_handle_of_step(cur_step_handle); + sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); + } + } -// if (is_left) { -// primer.sequence = sequence; -// } else { -// primer.sequence = reverse_complement(sequence); // Take the reverse complement for right primer -// } -// map_to_nodes(primer); // Search and store corresponding nodes ids -// } + if (is_left) { + primer.sequence = sequence; + } else { + primer.sequence = reverse_complement(sequence); // Take the reverse complement for right primer + } + map_to_nodes(primer, path_name); // Search and store corresponding nodes ids +} void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { - const Primer& left_primer = primer_pair.left_primer; + const Primer& left_primer = primer_pair.left_primer; const Primer& right_primer = primer_pair.right_primer; primer_pair.min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], @@ -270,20 +223,43 @@ const string PrimerFinder::strip(const string& s) const { return s.substr(start, end+1); } - -const bool PrimerFinder::no_variation(const PrimerPair& primer_pair) const { - Primer left_primer = primer_pair.left_primer; - Primer right_primer = primer_pair.right_primer; - for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { - // Check if any node has depth more than 1 (i.e. 
inside a bubble) - handle_t cur_handle = graph->get_handle(*node_id); - net_handle_t cur_net_handle = distance_index->get_net(cur_handle, graph); +void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_name) { + const Primer& left_primer = primer_pair.left_primer; + const Primer& right_primer = primer_pair.right_primer; + + nid_t right_edge_node_id = right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1]; + unordered_set primer_nodes_set; + for (size_t i = 0; i < left_primer.mapped_nodes_ids.size(); ++i) { + primer_nodes_set.insert(left_primer.mapped_nodes_ids[i]); + } + for (size_t i = 0; i < right_primer.mapped_nodes_ids.size(); ++i) { + primer_nodes_set.insert(right_primer.mapped_nodes_ids[i]); + } + + const path_handle_t& reference_path_handle = graph->get_path_handle(path_name); + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, left_primer.position); + handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + net_handle_t cur_net_handle = distance_index->get_net(cur_node_handle, graph); + nid_t cur_node_id = graph->get_id(cur_node_handle); + while (true) { size_t depth = distance_index->get_depth(cur_net_handle); if (depth != 1) { - return false; + if (primer_nodes_set.find(cur_node_id) != primer_nodes_set.end()) { + primer_pair.no_variation_at_primers = false; + primer_pair.no_variation_in_products = false; + break; + } else { + primer_pair.no_variation_in_products = false; + } } + if (cur_node_id == right_edge_node_id) { + break; + } + cur_node_step_handle = graph->get_next_step(cur_node_step_handle); + cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + cur_net_handle = distance_index->get_net(cur_node_handle, graph); + cur_node_id = graph->get_id(cur_node_handle); } - return true; } vector PrimerFinder::split(const string& str) { diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index a1ade072130..39e0be2d13f 100644 --- 
a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -38,8 +38,8 @@ struct Primer { string sequence; bool left = true; size_t position = numeric_limits::max(); - size_t length = numeric_limits::max(); - size_t offset = numeric_limits::max(); + size_t length = numeric_limits::max(); + size_t offset = numeric_limits::max(); vector mapped_nodes_ids; }; @@ -52,9 +52,10 @@ struct PrimerPair { Primer left_primer; Primer right_primer; size_t linear_product_size = numeric_limits::max(); - size_t min_product_size = numeric_limits::max(); - size_t max_product_size = numeric_limits::max(); - bool no_variation = false; + size_t min_product_size = numeric_limits::max(); + size_t max_product_size = numeric_limits::max(); + bool no_variation_at_primers = true; + bool no_variation_in_products = true; }; class PrimerFinder { @@ -88,10 +89,10 @@ class PrimerFinder { * primer_pair object is automatically added to primer_pairs vector - and * selected_primer_pairs if conditions are met. Mainly used for unit testing. */ - void add_primer_pair(const size_t& left_primer_starting_node_id, - const size_t& left_primer_offset, const size_t& left_primer_length, - const size_t& right_primer_starting_node_id, - const size_t& right_primer_offset, const size_t& right_primer_length); + void add_primer_pair(const string& path_name, const size_t& left_primer_starting_node_id, + const size_t& left_primer_offset, const size_t& left_primer_length, + const size_t& right_primer_starting_node_id, + const size_t& right_primer_offset, const size_t& right_primer_length); /** * Read the path to the primer3 output. Primers information is parsed, @@ -103,7 +104,7 @@ class PrimerFinder { /** * return vector of Primer pairs */ - const vector& get_primer_pairs(const string chrom) const; + const vector& get_primer_pairs_of_chrom(const string chrom_name) const; /** * return the total number of reference paths @@ -129,8 +130,8 @@ class PrimerFinder { * and the length of primer. 
* Used in: add_primer_pair */ - // void make_primer(Primer& primer, const size_t& starting_node_id, - // const size_t& offset, const size_t& length, const bool& is_left); + void make_primer(Primer& primer, const string& path_name, const size_t& starting_node_id, + const size_t& offset, const size_t& length, const bool& is_left); /** * Find and store corresponding node ids to Primer object. @@ -158,8 +159,10 @@ class PrimerFinder { * Used in: add_primer_node * load_primers */ - const bool no_variation(const PrimerPair& primer_pair) const; + // const bool no_variation_at_primers(const PrimerPair& primer_pair) const; + void update_variation(PrimerPair& primer_pair, const string& path_name); + /** * Split a string into vectors. * Used in: load_primers diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 229aa8cdd47..126776840aa 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -21,7 +21,8 @@ void help_primers(char** argv) { << " -z, --zero-variance allow no variance in the product" << endl << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl - << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl; + << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl + << " -a, --all-primers output all primers" << endl; } int main_primers(int argc, char** argv) { @@ -44,19 +45,18 @@ int main_primers(int argc, char** argv) { while (true) { static struct option long_options[] = { - {"help", no_argument, 0, 'h'}, - {"xg-path", required_argument, 0, 'x'}, - {"snarl-index", required_argument, 0, 's'}, + {"help", no_argument, 0, 'h'}, + {"xg-path", required_argument, 0, 'x'}, + {"snarl-index", required_argument, 0, 's'}, {"zero-variance", 
required_argument, 0, 'z'}, - {"tolerance", required_argument, 0, 'l'}, - {"minimum-size", required_argument, 0, 'n'}, - {"maximum-size", required_argument, 0, 'm'}, + {"tolerance", required_argument, 0, 'l'}, + {"minimum-size", required_argument, 0, 'n'}, + {"maximum-size", required_argument, 0, 'm'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:zl:n:m:", - long_options, &option_index); + c = getopt_long (argc, argv, "hx:s:zl:n:m:", long_options, &option_index); // Detect the end of the options. if (c == -1) break; @@ -108,11 +108,20 @@ int main_primers(int argc, char** argv) { exit(1); } - string file_name = get_input_file_name(optind, argc, argv); + string primers_path = get_input_file_name(optind, argc, argv); - cout << "primer file name: " << file_name << endl + cout << "primer file name: " << primers_path << endl << "xg file name: " << xg_path << endl << "snarl index file name: " << snarl_index_path << endl; + + SnarlDistanceIndex distance_index; + unique_ptr graph; + distance_index.deserialize(snarl_index_path); + graph = vg::io::VPKG::load_one(xg_path); + PrimerFinder primer_finder(graph, &distance_index); + primer_finder.load_primers(primers_path); + + return 0; } diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 85299d92c19..f9d0cd4591b 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -37,76 +37,124 @@ using namespace std; unique_ptr graph; string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer_tabular.out"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); - PrimerFinder primer_finder(graph, "y", &distance_index); + PrimerFinder 
primer_finder(graph, &distance_index); + primer_finder.load_primers(primers_path); - + SECTION("Loads the correct number of chromosomes") { + REQUIRE(primer_finder.total_reference_paths() == 1); + } + SECTION("Loads the correct number of primer pairs") { - REQUIRE(primer_finder.get_primer_pairs().size() == 5); + REQUIRE(primer_finder.get_primer_pairs_of_chrom("y").size() == 5); } SECTION("Loads and processes the primers correctly") { - primer_finder.add_primer_pair(9, 14, 20, 22, 0, 20); - const vector& primer_pairs = primer_finder.get_primer_pairs(); - const vector& selected_primer_pairs = primer_finder.get_selected_primer_pairs(); - const PrimerPair& pair_0 = primer_pairs[0]; // 1st set of primers read from primer3 output. No variation in either primers. - const PrimerPair& pair_5 = primer_pairs[5]; // made up set of primers. Variation in both priemrs. - - SECTION("Check for basic primer attributes") { - REQUIRE(pair_0.left_primer.sequence == "TGCCTGGCATAGAGGAAAGC"); - REQUIRE(pair_0.left_primer.position == 362); - REQUIRE(pair_0.left_primer.length == 20); - REQUIRE(pair_0.right_primer.sequence == "GCCAGAAGAGCCTCAAGGAG"); - REQUIRE(pair_0.right_primer.position == 466); - REQUIRE(pair_0.right_primer.length == 20); - REQUIRE(pair_5.left_primer.sequence == "AGCCAGACAAATCTGGGTTC"); - REQUIRE(pair_5.left_primer.position == 181); - REQUIRE(pair_5.left_primer.length == 20); - REQUIRE(pair_5.right_primer.sequence == "AGATAATTAAACTGAAGTTC"); - REQUIRE(pair_5.right_primer.position == 260); - REQUIRE(pair_5.right_primer.length == 20); - } + primer_finder.add_primer_pair("y", 9, 14, 20, 22, 0, 20); // made up data, variation both at primers and in product + primer_finder.add_primer_pair("y", 31, 0, 15, 34, 1, 15); // made up data, no variation at primers or in product + + // Correct primer attributes + const vector left_primers_sequences { + "TGCCTGGCATAGAGGAAAGC", "GAGTCGAGGCTCAAGGACAG", "CAGAGTCGAGGCTCAAGGAC", + "GAGGCTCAAGGACAGCTCTC", "TCCAGAAGCTGCTCTTTCCC", 
"AGCCAGACAAATCTGGGTTC", + "CAACTGGTAGTTACT" + }; + + const vector left_primers_positions { + 362, 620, 618, 625, 819, 181, 388 + }; + + const vector left_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector left_primers_nodes_count { + 2, 1, 1, 2, 2, 6, 1 + }; + + const vector right_primers_sequences { + "GCCAGAAGAGCCTCAAGGAG", "AGGAGAGCTGGGAAAAGGGA", "AGGAGAGCTGGGAAAAGGGA", + "AGGAGAGCTGGGAAAAGGGA", "GCCTGGGTAGCTTTGGATGT", "AGATAATTAAACTGAAGTTC", + "GTTGACAATGAAAAG" + }; - SECTION("Check for minimum and maximum distance") { - REQUIRE(pair_0.linear_product_size == 124); - REQUIRE(pair_0.min_product_size == 124); - REQUIRE(pair_0.max_product_size == 124); - REQUIRE(pair_5.linear_product_size == 99); - REQUIRE(pair_5.min_product_size == 97); - REQUIRE(pair_5.max_product_size == 100); + const vector right_primers_positions { + 466, 745, 745, 745, 935, 260, 485 + }; + + const vector right_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector right_primers_nodes_count { + 2, 1, 1, 1, 2, 3, 1 + }; + + const vector min_product_sizes { + 124, 142, 144, 137, 135, 97, 112 + }; + + const vector max_product_sizes { + 124, 145, 147, 140, 138, 100, 112 + }; + + const vector linear_product_sizes { + 124, 145, 147, 140, 136, 99, 112 + }; + + const vector no_variations_at_primers { + true, true, true, true, true, false, true + }; + + const vector no_variations_in_products { + false, false, false, false, false, false, true + }; + + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); + + REQUIRE(primer_pairs.size() == left_primers_sequences.size()); + for (size_t i = 0; i < primer_pairs.size(); ++i) { + REQUIRE(left_primers_nodes_count[i] == primer_pairs[i].left_primer.mapped_nodes_ids.size()); + REQUIRE(left_primers_sequences[i] == primer_pairs[i].left_primer.sequence); + REQUIRE(left_primers_positions[i] == primer_pairs[i].left_primer.position); + REQUIRE(left_primers_lengths[i] == primer_pairs[i].left_primer.length); + 
REQUIRE(right_primers_nodes_count[i] == primer_pairs[i].right_primer.mapped_nodes_ids.size()); + REQUIRE(right_primers_sequences[i] == primer_pairs[i].right_primer.sequence); + REQUIRE(right_primers_positions[i] == primer_pairs[i].right_primer.position); + REQUIRE(right_primers_lengths[i] == primer_pairs[i].right_primer.length); + REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); + REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); + REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); + REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); + REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); } - SECTION("Check that primers are mapped to correct nodes") { - vector pair_0_left_primer_nodes {27, 8}; - for (size_t i = 0; i < pair_0.left_primer.mapped_nodes_ids.size()-1; i++) { - REQUIRE(pair_0.left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + SECTION("Check that primers are assigned with correct nodes") { + vector pair_0_left_primer_nodes {27, 28}; + for (size_t i = 0; i < primer_pairs[0].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); } vector pair_0_right_primer_nodes {33, 34}; - for (size_t i = 0; i < pair_0.right_primer.mapped_nodes_ids.size()-1; i++) { - REQUIRE(pair_0.right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + for (size_t i = 0; i < primer_pairs[0].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); } vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; - for (size_t i = 0; i < pair_5.left_primer.mapped_nodes_ids.size()-1; i++) { - REQUIRE(pair_5.left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + for (size_t i = 0; i < primer_pairs[5].left_primer.mapped_nodes_ids.size(); i++) { + 
REQUIRE(primer_pairs[5].left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); } vector pair_5_right_primer_nodes {22, 24, 25}; - for (size_t i = 0; i < pair_5.right_primer.mapped_nodes_ids.size()-1; i++) { - REQUIRE(pair_5.right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + for (size_t i = 0; i < primer_pairs[5].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); } } - SECTION("Check for variation at primer sites") { - REQUIRE(primer_pairs.size() == 6); - REQUIRE(selected_primer_pairs.size() == 5); - } - } } } -} \ No newline at end of file +} \ No newline at end of file From 3aa772fb9812336f44712ccb91ea41ace2ac49e2 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Wed, 18 Oct 2023 18:03:18 -0700 Subject: [PATCH 16/43] update --- src/subcommand/primers_main.cpp | 53 ++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 126776840aa..32007840c3f 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -25,6 +25,21 @@ void help_primers(char** argv) { << " -a, --all-primers output all primers" << endl; } +void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { + const Primer& left_primer = primer_pair.left_primer; + const Primer& right_primer = primer_pair.right_primer; + cout << genome_name << "\t"; + cout << left_primer.sequence << "\t" << right_primer.sequence << "\t" + << left_primer.position << "\t" << right_primer.position << "\t" + << left_primer.length << "\t" << right_primer.length << "\t" + << primer_pair.linear_product_size << "\t" + << primer_pair.min_product_size << "\t" + << primer_pair.max_product_size << "\t" + << primer_pair.no_variation_at_primers << "\t" + << primer_pair.no_variation_in_products << endl; + +} + int main_primers(int argc, char** argv) { if (argc == 2) { @@ -35,9 
+50,10 @@ int main_primers(int argc, char** argv) { string xg_path; string snarl_index_path; bool zero_variance = false; - int tolerance = 20; - int minimum_product_size = numeric_limits::max(); - int maximum_product_size = numeric_limits::max(); + bool all_primers = false; + int tolerance = 10; + int minimum_product_size = numeric_limits::max(); + int maximum_product_size = numeric_limits::max(); int c; optind = 2; @@ -52,11 +68,12 @@ int main_primers(int argc, char** argv) { {"tolerance", required_argument, 0, 'l'}, {"minimum-size", required_argument, 0, 'n'}, {"maximum-size", required_argument, 0, 'm'}, + {"all-primers", required_argument, 0, 'a'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:zl:n:m:", long_options, &option_index); + c = getopt_long (argc, argv, "hx:s:zl:n:m:a", long_options, &option_index); // Detect the end of the options. if (c == -1) break; @@ -86,6 +103,10 @@ int main_primers(int argc, char** argv) { case 'm': maximum_product_size = parse(optarg); break; + + case 'a': + all_primers = true; + break; case 'h': case '?': @@ -110,9 +131,9 @@ int main_primers(int argc, char** argv) { string primers_path = get_input_file_name(optind, argc, argv); - cout << "primer file name: " << primers_path << endl - << "xg file name: " << xg_path << endl - << "snarl index file name: " << snarl_index_path << endl; + // cout << "primer file name: " << primers_path << endl + // << "xg file name: " << xg_path << endl + // << "snarl index file name: " << snarl_index_path << endl; SnarlDistanceIndex distance_index; unique_ptr graph; @@ -121,7 +142,23 @@ int main_primers(int argc, char** argv) { PrimerFinder primer_finder(graph, &distance_index); primer_finder.load_primers(primers_path); - + vector reference_paths = primer_finder.get_reference_paths(); + for (size_t i = 0; i < reference_paths.size(); ++i) { + string path_name = reference_paths[i]; + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom(path_name); + for 
(size_t j = 0; j < primer_pairs.size(); ++j) { + const PrimerPair& primer_pair = primer_pairs[j]; + if (all_primers) { + print_tabular(path_name, primer_pair); + } else if (zero_variance) { + if (primer_pair.no_variation_in_products) { + print_tabular(path_name, primer_pair); + } + } else if (primer_pair.no_variation_at_primers) { + print_tabular(path_name, primer_pair); + } + } + } return 0; } From 94cca8b0c407e1c2695fec7a58ddea3e17848056 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Thu, 19 Oct 2023 21:13:02 -0700 Subject: [PATCH 17/43] update --- src/primer_filter.cpp | 9 ++++---- src/primer_filter.hpp | 7 +++--- src/subcommand/primers_main.cpp | 41 +++++++++++++++++++++++++-------- src/unittest/primer_filter.cpp | 4 ++-- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index cddfefc16cd..3098ab297ef 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -7,9 +7,10 @@ using namespace std; // Constructor PrimerFinder::PrimerFinder(const unique_ptr& graph_param, - const SnarlDistanceIndex* distance_index_param) { + const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle) { graph = graph_param.get(); distance_index = distance_index_param; + load_primers(primers_file_handle); } // Destructor @@ -17,7 +18,7 @@ PrimerFinder::~PrimerFinder() { // nothing to do } -const vector& PrimerFinder::get_primer_pairs_of_chrom(const string chrom_name) const { +const vector& PrimerFinder::get_primer_pairs_of_chrom(const string& chrom_name) const { return chroms.at(chrom_name); } @@ -39,9 +40,9 @@ void PrimerFinder::add_primer_pair(const string& path_name, update_variation(primer_pair, path_name); } -void PrimerFinder::load_primers(const string& path_to_primers) { +void PrimerFinder::load_primers(ifstream& file_handle) { - ifstream file_handle(path_to_primers); + //ifstream file_handle(path_to_primers); assert(file_handle.is_open()); vector cur_fields; diff --git a/src/primer_filter.hpp 
b/src/primer_filter.hpp index 39e0be2d13f..b49c10e5ddd 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -66,7 +66,6 @@ class PrimerFinder { map> chroms; // map containing a vector of primer pairs for each chromosome const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; - path_handle_t reference_path_handle; public: PrimerFinder() = default; @@ -76,7 +75,7 @@ class PrimerFinder { * and pointer to SnarlDistanceIndex */ PrimerFinder(const unique_ptr& graph_param, - const SnarlDistanceIndex* distance_index_param); + const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle); /** * Destructor @@ -99,12 +98,12 @@ class PrimerFinder { * processed, and stored in primer_pairs vector - and selected_primer_pairs * if conditions are met. */ - void load_primers(const string& path_to_primers); + void load_primers(ifstream& file_handle); /** * return vector of Primer pairs */ - const vector& get_primer_pairs_of_chrom(const string chrom_name) const; + const vector& get_primer_pairs_of_chrom(const string& chrom_name) const; /** * return the total number of reference paths diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 32007840c3f..f9226e61985 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -18,13 +18,25 @@ void help_primers(char** argv) { << "options:" << endl << " -x, --xg-path FILE use this xg graph" << endl << " -s, --snarl-index FILE use this snarl index" << endl - << " -z, --zero-variance allow no variance in the product" << endl + << " -z, --zero-variation allow no variance in the product" << endl << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl << 
" -a, --all-primers output all primers" << endl; } +size_t difference(const size_t& a, const size_t& b) { + size_t diff; + if (a == b) { + diff = 0; + } else if (a > b) { + diff = a - b; + } else { + diff = b - a; + } + return diff; +} + void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { const Primer& left_primer = primer_pair.left_primer; const Primer& right_primer = primer_pair.right_primer; @@ -49,7 +61,7 @@ int main_primers(int argc, char** argv) { string xg_path; string snarl_index_path; - bool zero_variance = false; + bool zero_variation = false; bool all_primers = false; int tolerance = 10; int minimum_product_size = numeric_limits::max(); @@ -89,7 +101,7 @@ int main_primers(int argc, char** argv) { break; case 'z': - zero_variance = true; + zero_variation = true; break; case 'l': @@ -130,17 +142,13 @@ int main_primers(int argc, char** argv) { } string primers_path = get_input_file_name(optind, argc, argv); - - // cout << "primer file name: " << primers_path << endl - // << "xg file name: " << xg_path << endl - // << "snarl index file name: " << snarl_index_path << endl; SnarlDistanceIndex distance_index; unique_ptr graph; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_path); - PrimerFinder primer_finder(graph, &distance_index); - primer_finder.load_primers(primers_path); + ifstream file_handle(primers_path); + PrimerFinder primer_finder(graph, &distance_index, file_handle); vector reference_paths = primer_finder.get_reference_paths(); for (size_t i = 0; i < reference_paths.size(); ++i) { @@ -150,11 +158,24 @@ int main_primers(int argc, char** argv) { const PrimerPair& primer_pair = primer_pairs[j]; if (all_primers) { print_tabular(path_name, primer_pair); - } else if (zero_variance) { + } else if (zero_variation) { if (primer_pair.no_variation_in_products) { print_tabular(path_name, primer_pair); } } else if (primer_pair.no_variation_at_primers) { + if (minimum_product_size != 
numeric_limits::max() && + primer_pair.min_product_size < minimum_product_size) { + continue; + } + if (maximum_product_size != numeric_limits::max() && + primer_pair.max_product_size > maximum_product_size) { + continue; + } + if (difference(primer_pair.linear_product_size, primer_pair.min_product_size) > tolerance + || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { + continue; + } + print_tabular(path_name, primer_pair); } } diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index f9d0cd4591b..d311a933add 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -40,9 +40,9 @@ using namespace std; string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer_tabular.out"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); - PrimerFinder primer_finder(graph, &distance_index); + ifstream file_handle(primers_path); + PrimerFinder primer_finder(graph, &distance_index, file_handle); - primer_finder.load_primers(primers_path); SECTION("Loads the correct number of chromosomes") { REQUIRE(primer_finder.total_reference_paths() == 1); } From 3e643826004118d9d4d44e3ed4ba67d93574682a Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Thu, 19 Oct 2023 21:44:13 -0700 Subject: [PATCH 18/43] rm --- src/unittest/primers.cpp | 498 --------------------------------------- 1 file changed, 498 deletions(-) delete mode 100644 src/unittest/primers.cpp diff --git a/src/unittest/primers.cpp b/src/unittest/primers.cpp deleted file mode 100644 index 1ebe15088a1..00000000000 --- a/src/unittest/primers.cpp +++ /dev/null @@ -1,498 +0,0 @@ -// -// primers.cpp -// -// Unit tests for primer filter -// - -#include -#include -#include -#include -#include -#include -#include -#include "vg/io/json2pb.h" -#include -#include "catch.hpp" -#include "random_graph.hpp" -#include "randomness.hpp" -#include "../snarl_distance_index.hpp" -#include 
"../integrated_snarl_finder.hpp" -#include "../genotypekit.hpp" -#include "../traversal_finder.hpp" -#include -#include -#include "xg.hpp" - -namespace vg { - namespace unittest { - - TEST_CASE( "temporary place to write code in for primers", - "[primers]" ) { - - struct Primer { - string sequence; - bool left = true; - size_t position; - size_t length; - size_t offset; - vector mapped_nodes_ids; - }; - - struct Primer_pair { - Primer left_primer; - Primer right_primer; - size_t linear_product_size; - size_t min_product_size; - size_t max_product_size; - }; - - class Primer_finder { - private: - vector primer_pairs; - PathPositionHandleGraph* graph; - SnarlDistanceIndex* distance_index; - path_handle_t reference_path_handle; - vector selected_primer_pairs; - - public: - Primer_finder() = default; - Primer_finder( - //unique_ptr& graph_param, - unique_ptr& graph_param, - string reference_path_name, - SnarlDistanceIndex* distance_index_param) { - graph = graph_param.get(); - reference_path_handle = graph->get_path_handle("y"); - distance_index = distance_index_param; - } - ~Primer_finder() = default; - - - void make_primer(Primer& primer, size_t starting_node_id, size_t offset, size_t length, bool is_left) { - if (is_left) { - primer.left = true; - } else { - primer.left = false; - } - primer.length = length; - string sequence = ""; - handle_t cur_handle = graph->get_handle(starting_node_id); - step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; - primer.position = graph->get_position_of_step(cur_step_handle) + offset; - if (graph->get_length(cur_handle) - offset > length) { - sequence += graph->get_sequence(cur_handle).substr(offset, length); - } else { - sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); - while (sequence.size() < length) { - cur_step_handle = graph->get_next_step(cur_step_handle); - cur_handle = graph->get_handle_of_step(cur_step_handle); - sequence += 
graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); - } - } - - if (is_left) { - primer.sequence = sequence; - } else { - primer.sequence = revcomp(sequence); - } - map_to_nodes(primer); - } - - // starting position relative to the starting node - Primer_pair add_primer_pair(size_t left_primer_starting_node_id, - size_t left_primer_offset, size_t left_primer_length, - size_t right_primer_starting_node_id, - size_t right_primer_offset, size_t right_primer_length) - { - Primer left_primer; - Primer right_primer; - make_primer(left_primer, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); - make_primer(right_primer, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); - Primer_pair new_primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - primer_pairs.push_back(new_primer_pair); - if (no_variation(new_primer_pair)) { - selected_primer_pairs.push_back(new_primer_pair); - } - - return new_primer_pair; - } - - - void load_primers(string path_to_primers) { - regex left_seq_pattern("PRIMER_LEFT_\\d+_SEQUENCE=(\\w+)"); - regex right_seq_pattern("PRIMER_RIGHT_\\d+_SEQUENCE=(\\w+)"); - regex left_pos_pattern("PRIMER_LEFT_\\d+=(\\d+,\\d+)"); - regex right_pos_pattern("PRIMER_RIGHT_\\d+=(\\d+,\\d+)"); - - Primer left_primer {""}; - Primer right_primer {"", false}; - - ifstream file_handle(path_to_primers); - if (file_handle.is_open()) { - string line; - while (getline(file_handle, line)) { - line = rstrip(line); - smatch match; - if (regex_search(line, match, left_seq_pattern)) { - if (right_primer.sequence != "") { - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - primer_pairs.push_back(primer_pair); - if (no_variation(primer_pair)) { - selected_primer_pairs.push_back(primer_pair); 
- } - left_primer = {""}; - right_primer = {"", false}; - } - left_primer.sequence = match[1]; - } else if (regex_search(line, match, right_seq_pattern)) { - right_primer.sequence = match[1]; - } else if (regex_search(line, match, left_pos_pattern)) { - vector pos_and_len = split(match[1], ","); - left_primer.position = stoi(pos_and_len[0]); - left_primer.length = stoi(pos_and_len[1]); - } else if (regex_search(line, match, right_pos_pattern)) { - vector pos_and_len = split(match[1], ","); - right_primer.length = stoi(pos_and_len[1]); - right_primer.position = stoi(pos_and_len[0]) - right_primer.length + 1; - - } - } - map_to_nodes(left_primer); - map_to_nodes(right_primer); - Primer_pair primer_pair {left_primer, right_primer, - right_primer.position - left_primer.position + right_primer.length}; - if (no_variation(primer_pair)) { - selected_primer_pairs.push_back(primer_pair); - } - primer_pairs.push_back(primer_pair); - } - } - - - void map_to_nodes(Primer& primer) { - string primer_seq; - if (primer.left) { - primer_seq = primer.sequence; - } else { - primer_seq = revcomp(primer.sequence); - } - step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); - handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - string cur_node_sequence = graph->get_sequence(cur_node_handle); - size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq, true) - 1; - while (primer_matched_index < primer_seq.size()-1) { - cur_node_step_handle = graph->get_next_step(cur_node_step_handle); - cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - cur_node_sequence = graph->get_sequence(cur_node_handle); - string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); - primer_matched_index += 
longest_match_len(primer, primer_substr, cur_node_sequence, false); - } - } - - void run_test() { - cout << "testing HandleGraph..." << endl; - nid_t min_node_id = graph->min_node_id(); - nid_t max_node_id = graph->max_node_id(); - handle_t min_node = graph->get_handle(min_node_id); - handle_t max_node = graph->get_handle(max_node_id); - cout << "min node id: " << min_node_id << endl; - cout << "sequence: " << graph->get_sequence(min_node) << endl; - cout << "max node id: " << max_node_id << endl; - cout << "sequence: " << graph->get_sequence(max_node) << endl; - cout << "HandleGraph works! :)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing PathHandleGraph..." << endl; - cout << graph->get_path_count() << endl; - cout << "path with name y exists: " << graph->has_path("y") << endl; - cout << "reference path has " << graph->get_step_count(reference_path_handle) << - " node steps" << endl; - cout << "PathHandleGraph works! :)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing PathPositionHandleGraph..." 
<< endl; - cout << "referecne path length: " << graph->get_path_length(reference_path_handle) << endl; - step_handle_t step_handle_lprimer = graph->get_step_at_position(reference_path_handle, 362); // 362 is the position of the left primer of the first primer pair - handle_t handle_lprimer = graph->get_handle_of_step(step_handle_lprimer); - cout << "left primer sequence begin node seq: " << graph->get_sequence(handle_lprimer) << endl; - step_handle_t step_handle_lprimer_next = graph->get_next_step(step_handle_lprimer); - handle_t handle_lprimer_next = graph->get_handle_of_step(step_handle_lprimer_next); - cout << "left priemr sequence second node seq: " << graph->get_sequence(handle_lprimer_next) << endl; - - // Get node id for position 0 - step_handle_t position_0_step_handle = graph->get_step_at_position(reference_path_handle, 0); - handle_t position_0_handle = graph->get_handle_of_step(position_0_step_handle); - cout << "node id for position 0 " << graph->get_id(position_0_handle) << endl; - // Get node id for position 1 - step_handle_t position_1_step_handle = graph->get_step_at_position(reference_path_handle, 1); - handle_t position_1_handle = graph->get_handle_of_step(position_1_step_handle); - cout << "node id for position 1 " << graph->get_id(position_1_handle) << endl; - - // Get node id for position 31 - step_handle_t position_31_step_handle = graph->get_step_at_position(reference_path_handle, 31); - handle_t position_31_handle = graph->get_handle_of_step(position_31_step_handle); - cout << "node id for position 31 " << graph->get_id(position_31_handle) << endl; - - // Get node id for position 32 - step_handle_t position_32_step_handle = graph->get_step_at_position(reference_path_handle, 32); - handle_t position_32_handle = graph->get_handle_of_step(position_32_step_handle); - cout << "node id for position 32 " << graph->get_id(position_32_handle) << endl; - - cout << "PathPositionHandleGraph works! 
:)" << endl; - - cout << "-------------------------------------" << endl; - - cout << "testing SnarlDistanceIndex..." << endl; - net_handle_t root_node = distance_index->get_root(); - cout << "is root a root? " << distance_index->is_root(root_node) << endl; - cout << "is root a node? " << distance_index->is_node(root_node) << endl; - cout << "is root a snarl? " << distance_index->is_snarl(root_node) << endl; - cout << "is root a chain? " << distance_index->is_chain(root_node) << endl; - cout << "depth of root is: " << distance_index->get_depth(root_node) << endl; - - net_handle_t min_node_net_handle = distance_index->get_net(min_node, graph); - cout << "depth of min node is: " << distance_index->get_depth(min_node_net_handle) << endl; - cout << "make sure that min node net handle is a node: " << distance_index->is_node(min_node_net_handle) << endl; - - size_t min_dist_12_17 = distance_index->minimum_distance(12, false, 1, 17, false, 2); - size_t max_dist_12_17 = distance_index->maximum_distance(12, false, 3, 17, false, 3); - - cout << "min dist between node 12 and node 17: " << min_dist_12_17 << endl; - cout << "max dist between node 12 and node 17: " << max_dist_12_17 << endl; - - cout << "SnarlDistanceIndex works! :)" << endl; - cout << "-------------------------------------" << endl; - - cout << "Testing PathPositionHandleGraph..." << endl; - string left_primer_sequence; - handle_t starting_handle = graph->get_handle(3); - size_t left_primer_offset = 1; - size_t left_primer_length = 10; - step_handle_t starting_step_handle = graph->steps_of_handle(starting_handle)[0]; - left_primer_sequence += graph->get_sequence(starting_handle).substr(left_primer_offset, - graph->get_length(starting_handle) - left_primer_offset); - cout << left_primer_sequence << endl; - - - - - cout << "-------------------------------------" << endl; - - cout << "testing load_primers..." 
<< endl; - add_primer_pair(3, 1, 10, 8, 3, 15); - add_primer_pair(9, 14, 20, 22, 0, 20); - for (vector::iterator it = primer_pairs.begin(); it != primer_pairs.end(); ++it) { - Primer left_primer = it->left_primer; - Primer right_primer = it->right_primer; - cout << "offsets: " << left_primer.offset << " " << right_primer.offset << endl; - cout << "product size: " << it->linear_product_size << endl; - - size_t min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], - false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], - false, right_primer.offset); - cout << "minimum product size: " << min_product_size << endl; - - size_t max_product_size = distance_index->maximum_distance(left_primer.mapped_nodes_ids[0], - false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], - false, right_primer.offset); - cout << "maximum product size: " << max_product_size << endl; - - cout << left_primer.left << " " << left_primer.position << " " << - left_primer.length << " " << left_primer.sequence << endl; - for (int i = 0; i < left_primer.mapped_nodes_ids.size(); i++) { - size_t cur_node_id = left_primer.mapped_nodes_ids[i]; - handle_t cur_node_handle = graph->get_handle(cur_node_id); - cout << graph->get_sequence(cur_node_handle) << " "; - } - cout << endl; - cout << right_primer.left << " " << right_primer.position << " " << - right_primer.length << " " << right_primer.sequence << " " << revcomp(right_primer.sequence) << endl; - for (int i = 0; i < right_primer.mapped_nodes_ids.size(); i++) { - size_t cur_node_id = right_primer.mapped_nodes_ids[i]; - handle_t cur_node_handle = graph->get_handle(cur_node_id); - cout << graph->get_sequence(cur_node_handle) << " "; - } - cout << endl; - cout << endl; - } - - for (vector::iterator it = selected_primer_pairs.begin(); it != selected_primer_pairs.end(); ++it) { - Primer left_primer = it->left_primer; - Primer right_primer = 
it->right_primer; - cout << left_primer.sequence << " - " << right_primer.sequence << endl; - } - - cout << endl; - cout << "load_primers works! :)" << endl; - cout << "-------------------------------------" << endl; - - } - // void filter_primer() { - - //} - - private: - // Functions only used in load_primers().. Not sure where to put them for now - string rstrip(string const s) { - const string WHITESPACE = " \n\r\t\f\v"; - size_t end = s.find_last_not_of(WHITESPACE); - if (end == string::npos) { - return ""; - } - return s.substr(0, end+1); - } - - bool no_variation(const Primer_pair& primer_pair) { - Primer left_primer = primer_pair.left_primer; - Primer right_primer = primer_pair.right_primer; - for (vector::iterator node_id = left_primer.mapped_nodes_ids.begin(); node_id != left_primer.mapped_nodes_ids.end(); ++node_id) { - handle_t cur_handle = graph->get_handle(*node_id); - net_handle_t cur_net_handle = distance_index->get_net(cur_handle, graph); - size_t depth = distance_index->get_depth(cur_net_handle); - if (depth != 1) { - return false; - } - } - return true; - } - size_t longest_match_len(Primer& primer, string const left_seq, string const right_seq, bool const first_node) { - size_t llen = left_seq.size(), rlen = right_seq.size(); - size_t length = min(llen, rlen); - size_t longest_match = 0; - - // Change .. 
can be done in one for loop - if (first_node && llen >= rlen) { - for (size_t i = 0; i <= llen - rlen; i++) { - if (left_seq.substr(i, rlen) == right_seq) { - longest_match = rlen; - if (primer.left) { - primer.offset = i; - } else { - primer.offset = i + primer.sequence.size(); - } - return longest_match; - } - } - } - - if (first_node) { - for (size_t i = 1; i <= length; i++) { - if (left_seq.substr(llen - i, i) == right_seq.substr(0, i)) { - longest_match = i; - if (primer.left && first_node) { - primer.offset = llen - i; - } else if (!primer.left) { - primer.offset = i; - } - } - } - } else { - for (size_t i = 1; i <= length; i++) { - if (left_seq.substr(0, i) == right_seq.substr(0, i)) { - longest_match = i; - if (!primer.left) { - primer.offset = i; - } - } - } - } - - return longest_match; - } - - char complement(char nt) { - switch(nt) { - case 'A': return 'T'; - case 'C': return 'G'; - case 'G': return 'C'; - case 'T': return 'A'; - } - return 'N'; - } - - string revcomp(string const seq) { - string revcomp_seq; - for (int i = seq.size()-1; i >= 0; i--) { - revcomp_seq += complement(seq[i]); - } - return revcomp_seq; - } - - vector split(string str, string const delim) { - size_t cur_pos = 0; - string word; - vector word_list; - while ((cur_pos = str.find(delim)) != string::npos) { - word = str.substr(0, cur_pos); - word_list.push_back(word); - str.erase(0, cur_pos + delim.length()); - } - word = str; - word_list.push_back(word); - return word_list; - } - }; - - SnarlDistanceIndex distance_index; - unique_ptr graph; - string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; - string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3.out"; - distance_index.deserialize(snarl_index_path); - graph = vg::io::VPKG::load_one(xg_graph_path); - - //Primer_finder primer_finder; - Primer_finder primer_finder(graph, "y", &distance_index); 
- primer_finder.load_primers(primers_path); - primer_finder.run_test(); - - - - - // SnarlDistanceIndex distance_index; - - // string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.dist"; - // string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/tiny/tiny.xg"; - - // distance_index.deserialize(snarl_index_path); - // unique_ptr graph = vg::io::VPKG::load_one(xg_graph_path); - - // net_handle_t node = distance_index.get_node_net_handle(1); - // cout << distance_index.net_handle_as_string(node) << endl; - - // net_handle_t root_node = distance_index.get_root(); - // cout << "root: " << distance_index.net_handle_as_string(root_node) << endl; - - // cout << distance_index.is_root(node) << endl; - // cout << distance_index.is_snarl(node) << endl; - // cout << distance_index.is_chain(node) << endl; - // cout << distance_index.is_node(node) << endl; - // cout << "--------------------------------------------" << endl; - // cout << graph->get_node_count() << endl; - // cout << graph->get_edge_count() << endl; - // nid_t min_node_id = graph->min_node_id(); - // cout << graph->has_node(min_node_id) << endl; - // nid_t max_node_id = graph->max_node_id(); - // cout << min_node_id << endl; - // cout << max_node_id << endl; - // cout << "--------------------------------------------" << endl; - // handle_t min_node = graph->get_handle(min_node_id); - // cout << graph->get_sequence(min_node) << endl; - // handle_t max_node = graph->get_handle(max_node_id); - // cout << graph->get_sequence(max_node) << endl; - - - } - - } - -} \ No newline at end of file From 2f3f518da898cc7c488744164708e93068dc95b9 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Sun, 22 Oct 2023 21:32:59 -0700 Subject: [PATCH 19/43] up --- src/primer_filter.cpp | 70 +++++-- src/primer_filter.hpp | 15 +- src/subcommand/primers_main.cpp | 88 ++++----- src/unittest/primer_filter.cpp | 326 ++++++++++++++++++++++---------- 4 files changed, 329 insertions(+), 170 deletions(-) diff 
--git a/src/primer_filter.cpp b/src/primer_filter.cpp index 3098ab297ef..ccab3d2dd03 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -28,14 +28,19 @@ void PrimerFinder::add_primer_pair(const string& path_name, const size_t& left_primer_starting_node_id, const size_t& left_primer_offset, const size_t& left_primer_length, const size_t& right_primer_starting_node_id, const size_t& right_primer_offset, const size_t& right_primer_length) { - + chroms.at(path_name).emplace_back(); PrimerPair& primer_pair = chroms.at(path_name).back(); + primer_pair.chromosome_name = path_name; + primer_pair.template_position = 0; primer_pair.right_primer.left = false; - make_primer(primer_pair.left_primer, path_name, left_primer_starting_node_id, left_primer_offset, left_primer_length, true); - make_primer(primer_pair.right_primer, path_name, right_primer_starting_node_id, right_primer_offset, right_primer_length, false); - primer_pair.linear_product_size = primer_pair.right_primer.position - primer_pair.left_primer.position + primer_pair.right_primer.length; + make_primer(primer_pair.left_primer, path_name, left_primer_starting_node_id, + left_primer_offset, left_primer_length, true); + make_primer(primer_pair.right_primer, path_name, right_primer_starting_node_id, + right_primer_offset, right_primer_length, false); + primer_pair.linear_product_size = primer_pair.right_primer.position_template + - primer_pair.left_primer.position_template + primer_pair.right_primer.length; update_min_max_product_size(primer_pair); update_variation(primer_pair, path_name); } @@ -46,6 +51,8 @@ void PrimerFinder::load_primers(ifstream& file_handle) { assert(file_handle.is_open()); vector cur_fields; + size_t cur_template_offset; + string cur_template_info; string cur_path; string line; while (getline(file_handle, line)) { @@ -58,31 +65,40 @@ void PrimerFinder::load_primers(ifstream& file_handle) { assert(chroms[cur_path].back().right_primer.sequence.empty()); chroms[cur_path].pop_back(); 
} - cur_fields = move(split(line)); - cur_path = cur_fields[cur_fields.size()-1]; + cur_fields = move(split(line)); + cur_template_info = cur_fields[cur_fields.size()-1]; + cur_fields = move(split(cur_template_info,',')); + cur_template_offset = stoi(cur_fields[1]); + cur_path = cur_fields[0]; chroms[cur_path].emplace_back(); + chroms[cur_path].back().chromosome_name = cur_path; + chroms[cur_path].back().template_position = cur_template_offset; chroms[cur_path].back().right_primer.left = false; } else if (left_primer_line_start != string::npos) { cur_fields = move(split(line.substr(left_primer_line_start, line.size()))); PrimerPair& primer_pair = chroms[cur_path].back(); - primer_pair.left_primer.sequence = cur_fields[9]; - primer_pair.left_primer.position = stoi(cur_fields[2]); - primer_pair.left_primer.length = stoi(cur_fields[3]); + primer_pair.left_primer.position_chromosome = stoi(cur_fields[2]) + cur_template_offset; + primer_pair.left_primer.position_template = stoi(cur_fields[2]); + primer_pair.left_primer.sequence = cur_fields[9]; + primer_pair.left_primer.length = stoi(cur_fields[3]); } else if (startswith(line, "RIGHT PRIMER")) { cur_fields = move(split(line.substr(right_primer_line_start, line.size()))); PrimerPair& primer_pair = chroms[cur_path].back(); - primer_pair.right_primer.sequence = cur_fields[9]; - primer_pair.right_primer.position = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1; - primer_pair.right_primer.length = stoi(cur_fields[3]); + primer_pair.right_primer.position_chromosome = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1 + cur_template_offset; + primer_pair.right_primer.position_template = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1; + primer_pair.right_primer.sequence = cur_fields[9]; + primer_pair.right_primer.length = stoi(cur_fields[3]); assert(!primer_pair.left_primer.sequence.empty()); map_to_nodes(primer_pair.left_primer, cur_path); map_to_nodes(primer_pair.right_primer, cur_path); - primer_pair.linear_product_size = 
primer_pair.right_primer.position - - primer_pair.left_primer.position + primer_pair.right_primer.length; + primer_pair.linear_product_size = primer_pair.right_primer.position_template + - primer_pair.left_primer.position_template + primer_pair.right_primer.length; update_min_max_product_size(primer_pair); update_variation(primer_pair, cur_path); chroms[cur_path].emplace_back(); + chroms[cur_path].back().chromosome_name = cur_path; + chroms[cur_path].back().template_position = cur_template_offset; chroms[cur_path].back().right_primer.left = false; } } @@ -115,7 +131,8 @@ void PrimerFinder::make_primer(Primer& primer, const string& path_name, string sequence = ""; handle_t cur_handle = graph->get_handle(starting_node_id); // get the starting node handle step_handle_t cur_step_handle = graph->steps_of_handle(cur_handle)[0]; - primer.position = graph->get_position_of_step(cur_step_handle) + offset; + primer.position_template = graph->get_position_of_step(cur_step_handle) + offset; + primer.position_chromosome = primer.position_template; // Walk down the path and get the sequence of primer if (graph->get_length(cur_handle) - offset > length) { sequence += graph->get_sequence(cur_handle).substr(offset, length); @@ -123,8 +140,8 @@ void PrimerFinder::make_primer(Primer& primer, const string& path_name, sequence += graph->get_sequence(cur_handle).substr(offset, graph->get_length(cur_handle) - offset); while (sequence.size() < length) { cur_step_handle = graph->get_next_step(cur_step_handle); - cur_handle = graph->get_handle_of_step(cur_step_handle); - sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); + cur_handle = graph->get_handle_of_step(cur_step_handle); + sequence += graph->get_sequence(cur_handle).substr(0, min(graph->get_length(cur_handle), length-sequence.size())); } } @@ -158,7 +175,7 @@ void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { primer_seq = 
reverse_complement(primer.sequence); } - step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position); + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position_chromosome); handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); string cur_node_sequence = graph->get_sequence(cur_node_handle); @@ -238,7 +255,7 @@ void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_ } const path_handle_t& reference_path_handle = graph->get_path_handle(path_name); - step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, left_primer.position); + step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, left_primer.position_template); handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); net_handle_t cur_net_handle = distance_index->get_net(cur_node_handle, graph); nid_t cur_node_id = graph->get_id(cur_node_handle); @@ -246,7 +263,7 @@ void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_ size_t depth = distance_index->get_depth(cur_net_handle); if (depth != 1) { if (primer_nodes_set.find(cur_node_id) != primer_nodes_set.end()) { - primer_pair.no_variation_at_primers = false; + primer_pair.no_variation_at_primers = false; primer_pair.no_variation_in_products = false; break; } else { @@ -274,6 +291,19 @@ vector PrimerFinder::split(const string& str) { return fields; } +vector PrimerFinder::split(const string& str, const char& delim) { + istringstream iss(str); + string field; + vector fields; + + while (getline(iss, field, delim)) { + fields.push_back(field); + } + + return fields; +} + + bool PrimerFinder::startswith(const string& str, const string& prefix) { return str.compare(0, prefix.length(), prefix) == 0; } diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp 
index b49c10e5ddd..e6fbc98a4d9 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -37,9 +37,10 @@ namespace vg { struct Primer { string sequence; bool left = true; - size_t position = numeric_limits::max(); - size_t length = numeric_limits::max(); - size_t offset = numeric_limits::max(); + size_t position_chromosome = numeric_limits::max(); + size_t position_template = numeric_limits::max(); + size_t length = numeric_limits::max(); + size_t offset = numeric_limits::max(); vector mapped_nodes_ids; }; @@ -51,7 +52,9 @@ struct Primer { struct PrimerPair { Primer left_primer; Primer right_primer; + string chromosome_name; size_t linear_product_size = numeric_limits::max(); + size_t template_position = numeric_limits::max(); size_t min_product_size = numeric_limits::max(); size_t max_product_size = numeric_limits::max(); bool no_variation_at_primers = true; @@ -167,6 +170,12 @@ class PrimerFinder { * Used in: load_primers */ vector split(const string& str); + + /** + * Split a string into vectors given delimiter. 
+ */ + vector split(const string& str, const char& delim); + /** * Works like str.startswith(prefix) in python * Used in: load_primers diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index f9226e61985..2b23e489b0b 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -37,20 +37,20 @@ size_t difference(const size_t& a, const size_t& b) { return diff; } -void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { - const Primer& left_primer = primer_pair.left_primer; - const Primer& right_primer = primer_pair.right_primer; - cout << genome_name << "\t"; - cout << left_primer.sequence << "\t" << right_primer.sequence << "\t" - << left_primer.position << "\t" << right_primer.position << "\t" - << left_primer.length << "\t" << right_primer.length << "\t" - << primer_pair.linear_product_size << "\t" - << primer_pair.min_product_size << "\t" - << primer_pair.max_product_size << "\t" - << primer_pair.no_variation_at_primers << "\t" - << primer_pair.no_variation_in_products << endl; - -} +// void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { +// const Primer& left_primer = primer_pair.left_primer; +// const Primer& right_primer = primer_pair.right_primer; +// cout << genome_name << "\t"; +// cout << left_primer.sequence << "\t" << right_primer.sequence << "\t" +// << left_primer.template_position << "\t" << right_primer.template_position << "\t" +// << left_primer.length << "\t" << right_primer.length << "\t" +// << primer_pair.linear_product_size << "\t" +// << primer_pair.min_product_size << "\t" +// << primer_pair.max_product_size << "\t" +// << primer_pair.no_variation_at_primers << "\t" +// << primer_pair.no_variation_in_products << endl; + +// } int main_primers(int argc, char** argv) { @@ -150,36 +150,36 @@ int main_primers(int argc, char** argv) { ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); - vector 
reference_paths = primer_finder.get_reference_paths(); - for (size_t i = 0; i < reference_paths.size(); ++i) { - string path_name = reference_paths[i]; - const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom(path_name); - for (size_t j = 0; j < primer_pairs.size(); ++j) { - const PrimerPair& primer_pair = primer_pairs[j]; - if (all_primers) { - print_tabular(path_name, primer_pair); - } else if (zero_variation) { - if (primer_pair.no_variation_in_products) { - print_tabular(path_name, primer_pair); - } - } else if (primer_pair.no_variation_at_primers) { - if (minimum_product_size != numeric_limits::max() && - primer_pair.min_product_size < minimum_product_size) { - continue; - } - if (maximum_product_size != numeric_limits::max() && - primer_pair.max_product_size > maximum_product_size) { - continue; - } - if (difference(primer_pair.linear_product_size, primer_pair.min_product_size) > tolerance - || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { - continue; - } - - print_tabular(path_name, primer_pair); - } - } - } + // vector reference_paths = primer_finder.get_reference_paths(); + // for (size_t i = 0; i < reference_paths.size(); ++i) { + // string path_name = reference_paths[i]; + // const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom(path_name); + // for (size_t j = 0; j < primer_pairs.size(); ++j) { + // const PrimerPair& primer_pair = primer_pairs[j]; + // if (all_primers) { + // print_tabular(path_name, primer_pair); + // } else if (zero_variation) { + // if (primer_pair.no_variation_in_products) { + // print_tabular(path_name, primer_pair); + // } + // } else if (primer_pair.no_variation_at_primers) { + // if (minimum_product_size != numeric_limits::max() && + // primer_pair.min_product_size < minimum_product_size) { + // continue; + // } + // if (maximum_product_size != numeric_limits::max() && + // primer_pair.max_product_size > maximum_product_size) { + // continue; + // } + // if 
(difference(primer_pair.linear_product_size, primer_pair.min_product_size) > tolerance + // || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { + // continue; + // } + + // print_tabular(path_name, primer_pair); + // } + // } + // } return 0; } diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index d311a933add..076ce633dc0 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -37,124 +37,244 @@ using namespace std; unique_ptr graph; string snarl_index_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer_tabular.out"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); - ifstream file_handle(primers_path); - PrimerFinder primer_finder(graph, &distance_index, file_handle); - - SECTION("Loads the correct number of chromosomes") { - REQUIRE(primer_finder.total_reference_paths() == 1); - } + + SECTION("template_position=0") { + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos.out"; + ifstream file_handle(primers_path); + PrimerFinder primer_finder(graph, &distance_index, file_handle); - SECTION("Loads the correct number of primer pairs") { - REQUIRE(primer_finder.get_primer_pairs_of_chrom("y").size() == 5); - } + SECTION("Loads the correct number of chromosomes") { + REQUIRE(primer_finder.total_reference_paths() == 1); + } - SECTION("Loads and processes the primers correctly") { - primer_finder.add_primer_pair("y", 9, 14, 20, 22, 0, 20); // made up data, variation both at primers and in product - primer_finder.add_primer_pair("y", 31, 0, 15, 34, 1, 15); // made up data, no variation at primers or in product - - // Correct primer attributes - const vector left_primers_sequences { - "TGCCTGGCATAGAGGAAAGC", 
"GAGTCGAGGCTCAAGGACAG", "CAGAGTCGAGGCTCAAGGAC", - "GAGGCTCAAGGACAGCTCTC", "TCCAGAAGCTGCTCTTTCCC", "AGCCAGACAAATCTGGGTTC", - "CAACTGGTAGTTACT" - }; - - const vector left_primers_positions { - 362, 620, 618, 625, 819, 181, 388 - }; - - const vector left_primers_lengths { - 20, 20, 20, 20, 20, 20, 15 - }; - - const vector left_primers_nodes_count { - 2, 1, 1, 2, 2, 6, 1 - }; - - const vector right_primers_sequences { - "GCCAGAAGAGCCTCAAGGAG", "AGGAGAGCTGGGAAAAGGGA", "AGGAGAGCTGGGAAAAGGGA", - "AGGAGAGCTGGGAAAAGGGA", "GCCTGGGTAGCTTTGGATGT", "AGATAATTAAACTGAAGTTC", - "GTTGACAATGAAAAG" - }; - - const vector right_primers_positions { - 466, 745, 745, 745, 935, 260, 485 - }; - - const vector right_primers_lengths { - 20, 20, 20, 20, 20, 20, 15 - }; - - const vector right_primers_nodes_count { - 2, 1, 1, 1, 2, 3, 1 - }; - - const vector min_product_sizes { - 124, 142, 144, 137, 135, 97, 112 - }; - - const vector max_product_sizes { - 124, 145, 147, 140, 138, 100, 112 - }; - - const vector linear_product_sizes { - 124, 145, 147, 140, 136, 99, 112 - }; - - const vector no_variations_at_primers { - true, true, true, true, true, false, true - }; - - const vector no_variations_in_products { - false, false, false, false, false, false, true - }; - - const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); - - REQUIRE(primer_pairs.size() == left_primers_sequences.size()); - for (size_t i = 0; i < primer_pairs.size(); ++i) { - REQUIRE(left_primers_nodes_count[i] == primer_pairs[i].left_primer.mapped_nodes_ids.size()); - REQUIRE(left_primers_sequences[i] == primer_pairs[i].left_primer.sequence); - REQUIRE(left_primers_positions[i] == primer_pairs[i].left_primer.position); - REQUIRE(left_primers_lengths[i] == primer_pairs[i].left_primer.length); - REQUIRE(right_primers_nodes_count[i] == primer_pairs[i].right_primer.mapped_nodes_ids.size()); - REQUIRE(right_primers_sequences[i] == primer_pairs[i].right_primer.sequence); - REQUIRE(right_primers_positions[i] == 
primer_pairs[i].right_primer.position); - REQUIRE(right_primers_lengths[i] == primer_pairs[i].right_primer.length); - REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); - REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); - REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); - REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); - REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); + SECTION("Loads the correct number of primer pairs") { + REQUIRE(primer_finder.get_primer_pairs_of_chrom("y").size() == 5); } - SECTION("Check that primers are assigned with correct nodes") { - vector pair_0_left_primer_nodes {27, 28}; - for (size_t i = 0; i < primer_pairs[0].left_primer.mapped_nodes_ids.size(); i++) { - REQUIRE(primer_pairs[0].left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + SECTION("Loads and processes the primers correctly") { + primer_finder.add_primer_pair("y", 9, 14, 20, 22, 0, 20); // made up data, variation both at primers and in product + primer_finder.add_primer_pair("y", 31, 0, 15, 34, 1, 15); // made up data, no variation at primers or in product + + // Correct primer attributes + const vector left_primers_sequences { + "TGCCTGGCATAGAGGAAAGC", "GAGTCGAGGCTCAAGGACAG", "CAGAGTCGAGGCTCAAGGAC", + "GAGGCTCAAGGACAGCTCTC", "TCCAGAAGCTGCTCTTTCCC", "AGCCAGACAAATCTGGGTTC", + "CAACTGGTAGTTACT" + }; + + const vector left_primers_positions { + 362, 620, 618, 625, 819, 181, 388 + }; + + const vector left_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector left_primers_nodes_count { + 2, 1, 1, 2, 2, 6, 1 + }; + + const vector right_primers_sequences { + "GCCAGAAGAGCCTCAAGGAG", "AGGAGAGCTGGGAAAAGGGA", "AGGAGAGCTGGGAAAAGGGA", + "AGGAGAGCTGGGAAAAGGGA", "GCCTGGGTAGCTTTGGATGT", "AGATAATTAAACTGAAGTTC", + "GTTGACAATGAAAAG" + }; + + const vector right_primers_positions { + 466, 745, 745, 745, 935, 260, 485 + }; + + const vector 
right_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector right_primers_nodes_count { + 2, 1, 1, 1, 2, 3, 1 + }; + + const vector min_product_sizes { + 124, 142, 144, 137, 135, 97, 112 + }; + + const vector max_product_sizes { + 124, 145, 147, 140, 138, 100, 112 + }; + + const vector linear_product_sizes { + 124, 145, 147, 140, 136, 99, 112 + }; + + const vector no_variations_at_primers { + true, true, true, true, true, false, true + }; + + const vector no_variations_in_products { + false, false, false, false, false, false, true + }; + + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); + + REQUIRE(primer_pairs.size() == left_primers_sequences.size()); + for (size_t i = 0; i < primer_pairs.size(); ++i) { + REQUIRE(left_primers_nodes_count[i] == primer_pairs[i].left_primer.mapped_nodes_ids.size()); + REQUIRE(left_primers_sequences[i] == primer_pairs[i].left_primer.sequence); + REQUIRE(left_primers_positions[i] == primer_pairs[i].left_primer.position_chromosome); + REQUIRE(left_primers_lengths[i] == primer_pairs[i].left_primer.length); + REQUIRE(right_primers_nodes_count[i] == primer_pairs[i].right_primer.mapped_nodes_ids.size()); + REQUIRE(right_primers_sequences[i] == primer_pairs[i].right_primer.sequence); + REQUIRE(right_primers_positions[i] == primer_pairs[i].right_primer.position_chromosome); + REQUIRE(right_primers_lengths[i] == primer_pairs[i].right_primer.length); + REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); + REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); + REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); + REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); + REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); } - vector pair_0_right_primer_nodes {33, 34}; - for (size_t i = 0; i < primer_pairs[0].right_primer.mapped_nodes_ids.size(); i++) { - 
REQUIRE(primer_pairs[0].right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + SECTION("Check that primers are assigned with correct nodes") { + vector pair_0_left_primer_nodes {27, 28}; + for (size_t i = 0; i < primer_pairs[0].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + } + + vector pair_0_right_primer_nodes {33, 34}; + for (size_t i = 0; i < primer_pairs[0].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + } + + vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; + for (size_t i = 0; i < primer_pairs[5].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + } + + vector pair_5_right_primer_nodes {22, 24, 25}; + for (size_t i = 0; i < primer_pairs[5].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + } } - vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; - for (size_t i = 0; i < primer_pairs[5].left_primer.mapped_nodes_ids.size(); i++) { - REQUIRE(primer_pairs[5].left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + } + } + + SECTION("template_position=11") { + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos11.out"; + ifstream file_handle(primers_path); + PrimerFinder primer_finder(graph, &distance_index, file_handle); + + SECTION("Loads the correct number of chromosomes") { + REQUIRE(primer_finder.total_reference_paths() == 1); + } + + SECTION("Loads the correct number of primer pairs") { + REQUIRE(primer_finder.get_primer_pairs_of_chrom("y").size() == 5); + } + + SECTION("Loads and processes the primers correctly") { + primer_finder.add_primer_pair("y", 9, 14, 20, 22, 0, 20); // made up data, variation both at primers and in 
product + primer_finder.add_primer_pair("y", 31, 0, 15, 34, 1, 15); // made up data, no variation at primers or in product + + // Correct primer attributes + const vector left_primers_sequences { + "TGCCTGGCATAGAGGAAAGC", "GAGTCGAGGCTCAAGGACAG", "CAGAGTCGAGGCTCAAGGAC", + "GAGGCTCAAGGACAGCTCTC", "TCCAGAAGCTGCTCTTTCCC", "AGCCAGACAAATCTGGGTTC", + "CAACTGGTAGTTACT" + }; + + const vector left_primers_positions { + 362, 620, 618, 625, 819, 181, 388 + }; + + const vector left_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector left_primers_nodes_count { + 2, 1, 1, 2, 2, 6, 1 + }; + + const vector right_primers_sequences { + "GCCAGAAGAGCCTCAAGGAG", "AGGAGAGCTGGGAAAAGGGA", "AGGAGAGCTGGGAAAAGGGA", + "AGGAGAGCTGGGAAAAGGGA", "GCCTGGGTAGCTTTGGATGT", "AGATAATTAAACTGAAGTTC", + "GTTGACAATGAAAAG" + }; + + const vector right_primers_positions { + 466, 745, 745, 745, 935, 260, 485 + }; + + const vector right_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector right_primers_nodes_count { + 2, 1, 1, 1, 2, 3, 1 + }; + + const vector min_product_sizes { + 124, 142, 144, 137, 135, 97, 112 + }; + + const vector max_product_sizes { + 124, 145, 147, 140, 138, 100, 112 + }; + + const vector linear_product_sizes { + 124, 145, 147, 140, 136, 99, 112 + }; + + const vector no_variations_at_primers { + true, true, true, true, true, false, true + }; + + const vector no_variations_in_products { + false, false, false, false, false, false, true + }; + + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); + + REQUIRE(primer_pairs.size() == left_primers_sequences.size()); + for (size_t i = 0; i < primer_pairs.size(); ++i) { + REQUIRE(left_primers_nodes_count[i] == primer_pairs[i].left_primer.mapped_nodes_ids.size()); + REQUIRE(left_primers_sequences[i] == primer_pairs[i].left_primer.sequence); + REQUIRE(left_primers_positions[i] == primer_pairs[i].left_primer.position_chromosome); + REQUIRE(left_primers_lengths[i] == 
primer_pairs[i].left_primer.length); + REQUIRE(right_primers_nodes_count[i] == primer_pairs[i].right_primer.mapped_nodes_ids.size()); + REQUIRE(right_primers_sequences[i] == primer_pairs[i].right_primer.sequence); + REQUIRE(right_primers_positions[i] == primer_pairs[i].right_primer.position_chromosome); + REQUIRE(right_primers_lengths[i] == primer_pairs[i].right_primer.length); + REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); + REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); + REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); + REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); + REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); } - vector pair_5_right_primer_nodes {22, 24, 25}; - for (size_t i = 0; i < primer_pairs[5].right_primer.mapped_nodes_ids.size(); i++) { - REQUIRE(primer_pairs[5].right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + SECTION("Check that primers are assigned with correct nodes") { + vector pair_0_left_primer_nodes {27, 28}; + for (size_t i = 0; i < primer_pairs[0].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + } + + vector pair_0_right_primer_nodes {33, 34}; + for (size_t i = 0; i < primer_pairs[0].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + } + + vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; + for (size_t i = 0; i < primer_pairs[5].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + } + + vector pair_5_right_primer_nodes {22, 24, 25}; + for (size_t i = 0; i < primer_pairs[5].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + } } - } + } } 
- } } } \ No newline at end of file From 6e26ecb8218dc05db1f5b21e1793b642afcfc16b Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Mon, 23 Oct 2023 16:57:41 -0700 Subject: [PATCH 20/43] integration test --- test/t/55_vg_primers.t | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 test/t/55_vg_primers.t diff --git a/test/t/55_vg_primers.t b/test/t/55_vg_primers.t new file mode 100644 index 00000000000..9c4a611e9f9 --- /dev/null +++ b/test/t/55_vg_primers.t @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=../deps/bash-tap +. ../deps/bash-tap/bash-tap-bootstrap + +PATH=../bin:$PATH # for vg + +plan tests 7 + +# make graph and snarl dist index +vg construct -r small/y.fa -v small/y.vcf.gz > y.vg + +vg convert -x y.vg > y.xg + +vg index -j y.dist y.vg + +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist | wc -l) 6 "Get the expected number of primer pairs" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -a | wc -l) 6 "Get the expected number of primer pairs using --all-primers tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -z | wc -l) 1 "Get the expected number of primer pairs using --zero-variation tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -l 2 | wc -l) 3 "Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" + +vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist > y.ref_pos_0.out +vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist > y.ref_pos_11.out +diff -q <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_11.out) > diff_0_11 +is $(cat 
diff_0_11 | wc -l) 0 "These two output files should have identical primers except for their positions on template" + +# clean up +rm diff_0_11 +rm y.vg y.xg y.dist +rm y.ref_pos_0.out y.ref_pos_11.out \ No newline at end of file From 086b9a1e086e9822588360ff891bbd1fbb157374 Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 24 Oct 2023 14:13:37 -0700 Subject: [PATCH 21/43] up --- src/primer_filter.cpp | 2 +- src/primer_filter.hpp | 2 +- src/subcommand/primers_main.cpp | 97 ++++++++++++++++++--------------- src/unittest/primer_filter.cpp | 2 +- test/t/55_vg_primers.t | 9 ++- 5 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index ccab3d2dd03..3d6a34f2f4f 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -262,7 +262,7 @@ void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_ while (true) { size_t depth = distance_index->get_depth(cur_net_handle); if (depth != 1) { - if (primer_nodes_set.find(cur_node_id) != primer_nodes_set.end()) { + if (primer_nodes_set.find(cur_node_id) != primer_nodes_set.end()) { primer_pair.no_variation_at_primers = false; primer_pair.no_variation_in_products = false; break; diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index e6fbc98a4d9..a33906a5dd2 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -66,7 +66,7 @@ class PrimerFinder { private: // vector primer_pairs; // vector selected_primer_pairs; - map> chroms; // map containing a vector of primer pairs for each chromosome + unordered_map> chroms; // map containing a vector of primer pairs for each chromosome const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 2b23e489b0b..eedb2a80c52 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -37,20 +37,26 @@ size_t difference(const size_t& a, const size_t& b) { return 
diff; } -// void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { -// const Primer& left_primer = primer_pair.left_primer; -// const Primer& right_primer = primer_pair.right_primer; -// cout << genome_name << "\t"; -// cout << left_primer.sequence << "\t" << right_primer.sequence << "\t" -// << left_primer.template_position << "\t" << right_primer.template_position << "\t" -// << left_primer.length << "\t" << right_primer.length << "\t" -// << primer_pair.linear_product_size << "\t" -// << primer_pair.min_product_size << "\t" -// << primer_pair.max_product_size << "\t" -// << primer_pair.no_variation_at_primers << "\t" -// << primer_pair.no_variation_in_products << endl; - -// } +void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { + const Primer& left_primer = primer_pair.left_primer; + const Primer& right_primer = primer_pair.right_primer; + cout << genome_name << "\t" + << primer_pair.template_position << "\t" + << left_primer.sequence << "\t" + << right_primer.sequence << "\t" + << left_primer.position_template << "\t" + << right_primer.position_template << "\t" + << left_primer.position_chromosome << "\t" + << right_primer.position_chromosome << "\t" + << left_primer.length << "\t" + << right_primer.length << "\t" + << primer_pair.linear_product_size << "\t" + << primer_pair.min_product_size << "\t" + << primer_pair.max_product_size << "\t" + << primer_pair.no_variation_at_primers << "\t" + << primer_pair.no_variation_in_products << endl; + +} int main_primers(int argc, char** argv) { @@ -150,36 +156,39 @@ int main_primers(int argc, char** argv) { ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); - // vector reference_paths = primer_finder.get_reference_paths(); - // for (size_t i = 0; i < reference_paths.size(); ++i) { - // string path_name = reference_paths[i]; - // const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom(path_name); - // for 
(size_t j = 0; j < primer_pairs.size(); ++j) { - // const PrimerPair& primer_pair = primer_pairs[j]; - // if (all_primers) { - // print_tabular(path_name, primer_pair); - // } else if (zero_variation) { - // if (primer_pair.no_variation_in_products) { - // print_tabular(path_name, primer_pair); - // } - // } else if (primer_pair.no_variation_at_primers) { - // if (minimum_product_size != numeric_limits::max() && - // primer_pair.min_product_size < minimum_product_size) { - // continue; - // } - // if (maximum_product_size != numeric_limits::max() && - // primer_pair.max_product_size > maximum_product_size) { - // continue; - // } - // if (difference(primer_pair.linear_product_size, primer_pair.min_product_size) > tolerance - // || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { - // continue; - // } - - // print_tabular(path_name, primer_pair); - // } - // } - // } + cout << "chrom\ttplpos\tlpseq\trpseq\tlppostpl\trppostmp\tlpposchrom\trpposchrom\t" + << "lplen\trplen\tlinsize\tminsize\tmaxsize\tnvprimers\tnvproducts" << endl; + + vector reference_paths = primer_finder.get_reference_paths(); + for (size_t i = 0; i < reference_paths.size(); ++i) { + string path_name = reference_paths[i]; + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom(path_name); + for (size_t j = 0; j < primer_pairs.size(); ++j) { + const PrimerPair& primer_pair = primer_pairs[j]; + if (all_primers) { + print_tabular(path_name, primer_pair); + } else if (zero_variation) { + if (primer_pair.no_variation_in_products) { + print_tabular(path_name, primer_pair); + } + } else if (primer_pair.no_variation_at_primers) { + if (minimum_product_size != numeric_limits::max() && + primer_pair.min_product_size < minimum_product_size) { + continue; + } + if (maximum_product_size != numeric_limits::max() && + primer_pair.max_product_size > maximum_product_size) { + continue; + } + if (difference(primer_pair.linear_product_size, 
primer_pair.min_product_size) > tolerance + || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { + continue; + } + + print_tabular(path_name, primer_pair); + } + } + } return 0; } diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 076ce633dc0..ee888ef45f2 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -159,7 +159,7 @@ using namespace std; } SECTION("template_position=11") { - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos11.out"; + string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos_11.out"; ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); diff --git a/test/t/55_vg_primers.t b/test/t/55_vg_primers.t index 9c4a611e9f9..252116f5c80 100644 --- a/test/t/55_vg_primers.t +++ b/test/t/55_vg_primers.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 7 +plan tests 13 # make graph and snarl dist index vg construct -r small/y.fa -v small/y.vcf.gz > y.vg @@ -21,6 +21,13 @@ is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -l 2 | wc is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist | wc -l) 9 "Get the expected number of primer pairs" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -a | wc -l) 11 "Get the expected number of primer pairs using --all-primers tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -z | wc -l) 1 "Get the expected number of primer pairs using --zero-variation tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -l 2 | wc -l) 6 
"Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -m 140 | wc -l) 7 "Get the expected number of primer pairs using --maximum-size tag" + vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist > y.ref_pos_0.out vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist > y.ref_pos_11.out diff -q <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_11.out) > diff_0_11 From ef9b18d9c2e64307ab44f7e59f787b1eedf1737b Mon Sep 17 00:00:00 2001 From: Alan Zhang Date: Tue, 24 Oct 2023 15:11:17 -0700 Subject: [PATCH 22/43] up --- src/unittest/primer_filter.cpp | 8 +- test/primers/y.dist | Bin 0 -> 3072 bytes test/primers/y.primer3_with_ref_pos.out | 97 +++++++++++++++ test/primers/y.primer3_with_ref_pos_11.out | 97 +++++++++++++++ test/primers/y.primer3config.split | 24 ++++ test/primers/y.ref_pos_0.out | 6 + test/primers/y.ref_pos_11.out | 6 + test/primers/y.split.out | 134 +++++++++++++++++++++ test/primers/y.xg | Bin 0 -> 5468 bytes 9 files changed, 368 insertions(+), 4 deletions(-) create mode 100644 test/primers/y.dist create mode 100644 test/primers/y.primer3_with_ref_pos.out create mode 100644 test/primers/y.primer3_with_ref_pos_11.out create mode 100644 test/primers/y.primer3config.split create mode 100644 test/primers/y.ref_pos_0.out create mode 100644 test/primers/y.ref_pos_11.out create mode 100644 test/primers/y.split.out create mode 100644 test/primers/y.xg diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index ee888ef45f2..3a38aebbe29 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -35,13 +35,13 @@ using namespace std; SnarlDistanceIndex distance_index; unique_ptr graph; - string snarl_index_path = 
"/home/azhang/rotations/rotation_1/vg/alan/small/y.dist"; - string xg_graph_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.xg"; + string snarl_index_path = "test/primers/y.dist"; + string xg_graph_path = "test/primers/y.xg"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); SECTION("template_position=0") { - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos.out"; + string primers_path = "test/primers/y.primer3_with_ref_pos.out"; ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); @@ -159,7 +159,7 @@ using namespace std; } SECTION("template_position=11") { - string primers_path = "/home/azhang/rotations/rotation_1/vg/alan/small/y.primer3_with_ref_pos_11.out"; + string primers_path = "test/primers/y.primer3_with_ref_pos_11.out"; ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); diff --git a/test/primers/y.dist b/test/primers/y.dist new file mode 100644 index 0000000000000000000000000000000000000000..0113e77172bce5deb8d98e1d55cd2337d9cdb61c GIT binary patch literal 3072 zcmeH|PiP%g6vn@q_lz}Z(rHa>YWy?AKSZbyYN2ss@1jT%lqVDyQsfmxH&VKAVL|Av zzC|cqL}|h9QV~&CL0nXDF$EDfLKa=r-BSd2l7*m%Xn*J2naRsTknVFxX6C!+JLi1& zoICU0y#DTYYf6^Y_c(@E#_+$%F4W6va1-C~U&|BY!G>7h{}{V}(n$+V{CK_wH)&4P z{08sdcRR4#fxGL#F#dD-`B&7wyZYxga_Pur9TwST`DkQv=YwbRNye9Ji zBJgu*FJRKam+mj8JO;3b1XHS5{nlw=uhV{T+S;^(AKg>V_}XtC)7$4D%Pc`G9Lze{ z*zaH=a|H)H!Bd$jDDQXA5%&Nh5P&Dlx`Gp)U~|@KFFFliGyt@%IR|g}H>v-CZvlV; zKn3j`r^SO#yXdrH-e~}6#Ub~63euQCyV-z*;Ow)h>tt?)nm>*;xPcS z0V;d|_P_C{r_McQSUQ+!GRhRc^^ErQlRq%hwX};22Xn$DJ z`W`Qenr`E;yJaJcAG9Noq4E zDSsqAAbsPY;#$n8Tk2!vEd_x*SXswSn6tJ{I+q733w%W*6Ya`*` zxwfWQXl#g9o~dI; zQw%h{St(yhG!>45GW7w>XGQu7Rhdtrolhhc+TQlkm8nL&A0(K1Khyj~O6k`>xxzLM z1t8$0?s>;OoP!Qwf$l8lJ_Tzc%k4qhqFl*BW04La^aU`&00@1F6Hn?fq4q9NdfnqI z{5wv3zx}wS%smJwnDg>|B&8E?a#l=$&Gc#Ch^eMQX{LDWgZKr7T06~TIHrX1k>QUq zZl9{k5XDY)1C~vycch@UCrx 
zEl?pHOE$MgUv7+MS%AP29MV4|^`-9tjO}6R1L>r{7~7-Y=vc$-&@XXIqlf?5w8~>F zG-fA{<3rzebo6(x76{nU(f2w!Y@eIPw){n~j;5K?Dx2wX*VRj`G}C2z{j+q1X15ku zqYhiJIO%oDol(UwzV;s$5`>mq@-)bFMEYgI_=3z|vdFEN&@KA5;4zdTZEes3KQSGJ avmS$|Y<$+u;tA>>>>>>>>>>>>>>>>>>> + + 420 CTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCT + <<<<<<<<<<<<<< + + 480 TCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGT + <<<<<< + + 540 AGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTA + + + 600 AGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGT + + + 660 GTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTT + + + 720 TTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGAT + + + 780 CTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCT + + + 840 CAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAG + + + 900 CTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTT + + + 960 TAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA + + +KEYS (in order of precedence): +>>>>>> left primer +<<<<<< right primer + +ADDITIONAL OLIGOS + start len tm gc% any_th 3'_th hairpin seq + + 1 LEFT PRIMER 620 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG + RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 2 LEFT PRIMER 618 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC + RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 3 LEFT PRIMER 625 20 59.82 60.00 10.25 0.00 37.05 GAGGCTCAAGGACAGCTCTC + RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 + + 4 LEFT PRIMER 819 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC + RIGHT PRIMER 954 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT + PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 + +Statistics + con too in in not no tm tm high 
high high high + sid many tar excl ok bad GC too too any_th 3'_th hair- poly end + ered Ns get reg reg GC% clamp low high compl compl pin X stab ok +Left 4586 0 0 0 0 41 0 3329 93 0 0 6 15 0 1102 +Right 4585 0 0 0 0 40 0 3257 106 0 0 0 15 0 1167 +Pair Stats: +considered 106, unacceptable product size 101, ok 5 +libprimer3 release 2.6.1 + + diff --git a/test/primers/y.primer3_with_ref_pos_11.out b/test/primers/y.primer3_with_ref_pos_11.out new file mode 100644 index 00000000000..8843691fccf --- /dev/null +++ b/test/primers/y.primer3_with_ref_pos_11.out @@ -0,0 +1,97 @@ +PRIMER PICKING RESULTS FOR y,11 + +No mispriming library specified +Using 0-based sequence positions +OLIGO start len tm gc% any_th 3'_th hairpin seq +LEFT PRIMER 351 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC +RIGHT PRIMER 474 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG +SEQUENCE SIZE: 990 +INCLUDED REGION SIZE: 990 + +PRODUCT SIZE: 124, PAIR ANY_TH COMPL: 6.57, PAIR 3'_TH COMPL: 4.13 + + 0 TGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAAC + + + 60 TAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGAC + + + 120 AATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAA + + + 180 ATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTT + + + 240 AATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGA + + + 300 GGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCA + >>>>>>>>> + + 360 TAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCT + >>>>>>>>>>> + + 420 TGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTC + <<<<<<<<<<<<<<<<<<<< + + 480 ATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGAT + + + 540 CTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGG + + + 600 GCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGT + + + 660 AACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACA + + + 720 TGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGG + + + 780 
CCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCAT + + + 840 TTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCC + + + 900 TCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTG + + + 960 TGGACTAAGGACAAAGGTGCGGGGAGATGA + + +KEYS (in order of precedence): +>>>>>> left primer +<<<<<< right primer + +ADDITIONAL OLIGOS + start len tm gc% any_th 3'_th hairpin seq + + 1 LEFT PRIMER 609 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG + RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 2 LEFT PRIMER 607 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC + RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 3 LEFT PRIMER 614 20 59.82 60.00 10.25 0.00 37.05 GAGGCTCAAGGACAGCTCTC + RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 + + 4 LEFT PRIMER 808 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC + RIGHT PRIMER 943 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT + PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 + +Statistics + con too in in not no tm tm high high high high + sid many tar excl ok bad GC too too any_th 3'_th hair- poly end + ered Ns get reg reg GC% clamp low high compl compl pin X stab ok +Left 4531 0 0 0 0 41 0 3277 93 0 0 6 15 0 1099 +Right 4530 0 0 0 0 40 0 3203 106 0 0 0 15 0 1166 +Pair Stats: +considered 106, unacceptable product size 101, ok 5 +libprimer3 release 2.6.1 + + diff --git a/test/primers/y.primer3config.split b/test/primers/y.primer3config.split new file mode 100644 index 00000000000..e34445c97b2 --- /dev/null +++ b/test/primers/y.primer3config.split @@ -0,0 +1,24 @@ +SEQUENCE_ID=y,100 
+SEQUENCE_TEMPLATE=TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 += +SEQUENCE_ID=y,601 +SEQUENCE_TEMPLATE=GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 += diff --git a/test/primers/y.ref_pos_0.out b/test/primers/y.ref_pos_0.out new file mode 100644 index 00000000000..e051f4c85d5 --- /dev/null +++ b/test/primers/y.ref_pos_0.out @@ -0,0 +1,6 @@ +chrom tplpos lpseq rpseq lppostpl rppostmp lpposchrom rpposchrom lplen rplen linsize minsize maxsize nvprimers nvproducts +y 0 TGCCTGGCATAGAGGAAAGC GCCAGAAGAGCCTCAAGGAG 362 466 362 466 20 20 124 124 124 1 0 +y 0 GAGTCGAGGCTCAAGGACAG AGGAGAGCTGGGAAAAGGGA 620 745 620 745 20 20 145 142 145 1 0 +y 0 CAGAGTCGAGGCTCAAGGAC AGGAGAGCTGGGAAAAGGGA 618 745 618 745 20 20 147 144 147 1 0 +y 0 GAGGCTCAAGGACAGCTCTC AGGAGAGCTGGGAAAAGGGA 625 745 625 745 20 20 140 137 140 1 0 +y 0 TCCAGAAGCTGCTCTTTCCC 
GCCTGGGTAGCTTTGGATGT 819 935 819 935 20 20 136 135 138 1 0 diff --git a/test/primers/y.ref_pos_11.out b/test/primers/y.ref_pos_11.out new file mode 100644 index 00000000000..0f32344e26e --- /dev/null +++ b/test/primers/y.ref_pos_11.out @@ -0,0 +1,6 @@ +chrom tplpos lpseq rpseq lppostpl rppostmp lpposchrom rpposchrom lplen rplen linsize minsize maxsize nvprimers nvproducts +y_x 11 TGCCTGGCATAGAGGAAAGC GCCAGAAGAGCCTCAAGGAG 351 455 362 466 20 20 124 124 124 1 0 +y 11 GAGTCGAGGCTCAAGGACAG AGGAGAGCTGGGAAAAGGGA 609 734 620 745 20 20 145 142 145 1 0 +y 11 CAGAGTCGAGGCTCAAGGAC AGGAGAGCTGGGAAAAGGGA 607 734 618 745 20 20 147 144 147 1 0 +y 11 GAGGCTCAAGGACAGCTCTC AGGAGAGCTGGGAAAAGGGA 614 734 625 745 20 20 140 137 140 1 0 +y 11 TCCAGAAGCTGCTCTTTCCC GCCTGGGTAGCTTTGGATGT 808 924 819 935 20 20 136 135 138 1 0 diff --git a/test/primers/y.split.out b/test/primers/y.split.out new file mode 100644 index 00000000000..573628f03c5 --- /dev/null +++ b/test/primers/y.split.out @@ -0,0 +1,134 @@ +PRIMER PICKING RESULTS FOR y,100 + +No mispriming library specified +Using 0-based sequence positions +OLIGO start len tm gc% any_th 3'_th hairpin seq +LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC +RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG +SEQUENCE SIZE: 401 +INCLUDED REGION SIZE: 401 + +PRODUCT SIZE: 124, PAIR ANY_TH COMPL: 6.57, PAIR 3'_TH COMPL: 4.13 + + 0 TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAA + + + 60 TGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACA + + + 120 TATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCT + + + 180 CTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGAT + + + 240 GCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTT + >>>>>>>>>>>>>>>>>>>> + + 300 ACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACC + + + 360 TCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA + <<<<<<<<<<<<<<<<<<<< + +KEYS (in order of precedence): +>>>>>> left primer +<<<<<< right primer + 
+ADDITIONAL OLIGOS + start len tm gc% any_th 3'_th hairpin seq + + 1 LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC + RIGHT PRIMER 387 20 59.67 55.00 0.00 0.00 38.53 AAGCCAGAAGAGCCTCAAGG + PRODUCT SIZE: 126, PAIR ANY_TH COMPL: 11.71, PAIR 3'_TH COMPL: 4.81 + + 2 LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC + RIGHT PRIMER 386 20 60.55 55.00 0.00 0.00 37.39 AGCCAGAAGAGCCTCAAGGA + PRODUCT SIZE: 125, PAIR ANY_TH COMPL: 11.71, PAIR 3'_TH COMPL: 0.00 + + 3 LEFT PRIMER 274 21 59.93 52.38 0.00 0.00 42.19 AGGAAAGCCTCTGACAACTGG + RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG + PRODUCT SIZE: 112, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 4 LEFT PRIMER 275 21 60.20 52.38 0.00 0.00 0.00 GGAAAGCCTCTGACAACTGGT + RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG + PRODUCT SIZE: 111, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 6.95 + +Statistics + con too in in not no tm tm high high high high + sid many tar excl ok bad GC too too any_th 3'_th hair- poly end + ered Ns get reg reg GC% clamp low high compl compl pin X stab ok +Left 1635 0 0 0 0 20 0 1345 6 0 0 19 0 0 245 +Right 1635 0 0 0 0 16 0 1303 8 0 0 3 0 0 305 +Pair Stats: +considered 61, unacceptable product size 55, ok 6 +libprimer3 release 2.6.1 + + +PRIMER PICKING RESULTS FOR y,601 + +No mispriming library specified +Using 0-based sequence positions +OLIGO start len tm gc% any_th 3'_th hairpin seq +LEFT PRIMER 19 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG +RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA +SEQUENCE SIZE: 400 +INCLUDED REGION SIZE: 400 + +PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 0 GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTG + >>>>>>>>>>>>>>>>>>>> + + 60 TATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTT + + + 120 TATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATC + <<<<<<<<<<<<<<<<<<<< + + 180 
TCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTC + + + 240 AATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGC + + + 300 TTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTT + + + 360 AAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA + + +KEYS (in order of precedence): +>>>>>> left primer +<<<<<< right primer + +ADDITIONAL OLIGOS + start len tm gc% any_th 3'_th hairpin seq + + 1 LEFT PRIMER 17 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC + RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 + + 2 LEFT PRIMER 24 20 59.82 60.00 10.25 0.00 37.05 GAGGCTCAAGGACAGCTCTC + RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA + PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 + + 3 LEFT PRIMER 218 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC + RIGHT PRIMER 353 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT + PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 + + 4 LEFT PRIMER 219 20 59.67 55.00 1.00 0.00 34.56 CCAGAAGCTGCTCTTTCCCT + RIGHT PRIMER 353 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT + PRODUCT SIZE: 135, PAIR ANY_TH COMPL: 12.02, PAIR 3'_TH COMPL: 3.44 + +Statistics + con too in in not no tm tm high high high high + sid many tar excl ok bad GC too too any_th 3'_th hair- poly end + ered Ns get reg reg GC% clamp low high compl compl pin X stab ok +Left 1630 0 0 0 0 0 0 1013 75 0 0 1 0 0 541 +Right 1630 0 0 0 0 0 0 1066 37 0 0 0 0 0 527 +Pair Stats: +considered 23, unacceptable product size 18, ok 5 +libprimer3 release 2.6.1 + + diff --git a/test/primers/y.xg b/test/primers/y.xg new file mode 100644 index 0000000000000000000000000000000000000000..4542c78671b744b8b0e073051427873981c21d9f GIT binary patch literal 5468 zcmeHLYfMx}6rTHFcY$5@B2gDy^@_NFkGLolN)hj_K&%o23RcC(`Y2sejcY3rtxaba zv`B1Sd{wATk$^=)qqWua5#!|{7HTWTSfQ3m3avKv2aS)uZ0*dQxr@bWXthb3aL91Z 
zoO8Z&&zw0IX6~(DKc+#N4FJAu*+QyK`ju65FP(T(i z4iV@?UVwgPJt$%rT?L~{5*#Uj;k%e1KXb4In5(jznF&P+Jmv%qj1vkH80CT>9K$r` z2-wOISd?ovfdzIAQD6-Bfk}i2Cb5FyHr5rxLQdvUfMi#%WMy?CG=zBp&o^RSxJzJS z3`l20x+4A@(OKQGL#dAUyFzpsOroe0kRNm~_F!IwTCvwvODmw2bDb$l|AsR9fcZ&Lg_Cnp(;M~(+wZK zDz|XlQyKe0Q_jXNZ~u78vNiGP%&GV_BagHg*F8Hq##q-ew*B;$C0QmzUS)OM-Z=5X zh`91S2jkZ6J&`>(vUq=&rKor8sd1f&f z4Xs}W&(Y?-cCtO*>ORB0eImP5KO`r5b?Kt&qh+5TI%F*ycdI;ge)|@7VRYm1g@^p} zpUmHz-#Vjk#oFA3k-Jte-&tcI@vfpT>oNarf|xk1`G0pJ>5z@wv~G?ca~$zKt1B{1RDV^Urv z#D^&}vUDom#blp9ZAqZT<-VIBozRu^nbarwkbNd1$yA{__O}DSdo)CG=MIB*P+d`k z&+85%Roj;ihdkBQs;(@+zN&U3RTC|z*s%TKh}z+Z(H%TN}&kaOZ^tBPSrxHXLHVXt58q)kOulQgUUS(>+XAp zfE01xib!e_)Dgr}p^X}tqNX$3LWPj2kq6;E*3y(-9BPfsmD11b-^awna4R%TihZh> zE5m&O0VbN#eGND>mYWZVQuCjtB$jo8YHCzA@tGi=|dZL zBzs<7uU&SvWmc0TD7Ep@5ik{b29$#(h#jU)mt z)1-wF@F+;{PRihChGZ|4w>W!7r73_pdA5PXc!KCxY=wf%%%?<$Q z^At!kuWKIj7I1tr3EcQP5XiCJXA*h@UP>L-v1-o}pieX9HI-g`^QBS0Tpb6Nmqgdp j9IPyvc)Wv^CxOmQ-yc4m{DMKg1kArD^c Date: Sat, 28 Oct 2023 16:22:47 -0700 Subject: [PATCH 23/43] update --- src/primer_filter.cpp | 14 +++++++++----- src/primer_filter.hpp | 1 + src/subcommand/primers_main.cpp | 11 +++++++---- test/primers/y.primer3_with_ref_pos.out | 2 +- test/primers/y.primer3_with_ref_pos_11.out | 2 +- test/primers/y.split.out | 4 ++-- test/t/55_vg_primers.t | 2 +- 7 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 3d6a34f2f4f..f24849818eb 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -53,6 +53,7 @@ void PrimerFinder::load_primers(ifstream& file_handle) { vector cur_fields; size_t cur_template_offset; string cur_template_info; + string cur_template_feature; string cur_path; string line; while (getline(file_handle, line)) { @@ -65,14 +66,16 @@ void PrimerFinder::load_primers(ifstream& file_handle) { assert(chroms[cur_path].back().right_primer.sequence.empty()); chroms[cur_path].pop_back(); } - cur_fields = move(split(line)); - cur_template_info = 
cur_fields[cur_fields.size()-1]; - cur_fields = move(split(cur_template_info,',')); - cur_template_offset = stoi(cur_fields[1]); - cur_path = cur_fields[0]; + cur_fields = move(split(line)); + cur_template_info = cur_fields[cur_fields.size()-1]; + cur_fields = move(split(cur_template_info,'|')); + cur_template_feature = cur_fields[1] + "|" + cur_fields[2]; + cur_template_offset = stoi(cur_fields[3]); + cur_path = cur_fields[0]; chroms[cur_path].emplace_back(); chroms[cur_path].back().chromosome_name = cur_path; chroms[cur_path].back().template_position = cur_template_offset; + chroms[cur_path].back().template_feature = cur_template_feature; chroms[cur_path].back().right_primer.left = false; } else if (left_primer_line_start != string::npos) { cur_fields = move(split(line.substr(left_primer_line_start, line.size()))); @@ -99,6 +102,7 @@ void PrimerFinder::load_primers(ifstream& file_handle) { chroms[cur_path].emplace_back(); chroms[cur_path].back().chromosome_name = cur_path; chroms[cur_path].back().template_position = cur_template_offset; + chroms[cur_path].back().template_feature = cur_template_feature; chroms[cur_path].back().right_primer.left = false; } } diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index a33906a5dd2..edfb9bb7d54 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -53,6 +53,7 @@ struct PrimerPair { Primer left_primer; Primer right_primer; string chromosome_name; + string template_feature; size_t linear_product_size = numeric_limits::max(); size_t template_position = numeric_limits::max(); size_t min_product_size = numeric_limits::max(); diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index eedb2a80c52..a700f55ec75 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -40,7 +40,9 @@ size_t difference(const size_t& a, const size_t& b) { void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { const Primer& left_primer = 
primer_pair.left_primer; const Primer& right_primer = primer_pair.right_primer; + size_t rln = right_primer.mapped_nodes_ids.size() - 1; //right primer last node index cout << genome_name << "\t" + << primer_pair.template_feature << "\t" << primer_pair.template_position << "\t" << left_primer.sequence << "\t" << right_primer.sequence << "\t" @@ -48,6 +50,8 @@ void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { << right_primer.position_template << "\t" << left_primer.position_chromosome << "\t" << right_primer.position_chromosome << "\t" + << left_primer.mapped_nodes_ids[0] << "\t" + << right_primer.mapped_nodes_ids[rln] << "\t" << left_primer.length << "\t" << right_primer.length << "\t" << primer_pair.linear_product_size << "\t" @@ -55,7 +59,6 @@ void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { << primer_pair.max_product_size << "\t" << primer_pair.no_variation_at_primers << "\t" << primer_pair.no_variation_in_products << endl; - } int main_primers(int argc, char** argv) { @@ -156,8 +159,8 @@ int main_primers(int argc, char** argv) { ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle); - cout << "chrom\ttplpos\tlpseq\trpseq\tlppostpl\trppostmp\tlpposchrom\trpposchrom\t" - << "lplen\trplen\tlinsize\tminsize\tmaxsize\tnvprimers\tnvproducts" << endl; + cout << "chrom\ttplfeat\ttplpos\tlpseq\trpseq\tlppostpl\trppostmp\tlpposchrom\trpposchrom\t" + << "lpnid\trpnid\tlplen\trplen\tlinsize\tminsize\tmaxsize\tnvprimers\tnvproducts" << endl; vector reference_paths = primer_finder.get_reference_paths(); for (size_t i = 0; i < reference_paths.size(); ++i) { @@ -193,4 +196,4 @@ int main_primers(int argc, char** argv) { return 0; } -static Subcommand vg_primers("primers", "filter primers for low variation", main_primers); \ No newline at end of file +static Subcommand vg_primers("primers", "filter primers for low variation", main_primers); diff --git 
a/test/primers/y.primer3_with_ref_pos.out b/test/primers/y.primer3_with_ref_pos.out index 22991788382..f4fc72bc305 100644 --- a/test/primers/y.primer3_with_ref_pos.out +++ b/test/primers/y.primer3_with_ref_pos.out @@ -1,4 +1,4 @@ -PRIMER PICKING RESULTS FOR y,0 +PRIMER PICKING RESULTS FOR y|gene|feature|0 No mispriming library specified Using 0-based sequence positions diff --git a/test/primers/y.primer3_with_ref_pos_11.out b/test/primers/y.primer3_with_ref_pos_11.out index 8843691fccf..2ac75ef9695 100644 --- a/test/primers/y.primer3_with_ref_pos_11.out +++ b/test/primers/y.primer3_with_ref_pos_11.out @@ -1,4 +1,4 @@ -PRIMER PICKING RESULTS FOR y,11 +PRIMER PICKING RESULTS FOR y|gene|feature|11 No mispriming library specified Using 0-based sequence positions diff --git a/test/primers/y.split.out b/test/primers/y.split.out index 573628f03c5..54c5bdefed6 100644 --- a/test/primers/y.split.out +++ b/test/primers/y.split.out @@ -1,4 +1,4 @@ -PRIMER PICKING RESULTS FOR y,100 +PRIMER PICKING RESULTS FOR y|gene|feature|100 No mispriming library specified Using 0-based sequence positions @@ -65,7 +65,7 @@ considered 61, unacceptable product size 55, ok 6 libprimer3 release 2.6.1 -PRIMER PICKING RESULTS FOR y,601 +PRIMER PICKING RESULTS FOR y|gene|feature|601 No mispriming library specified Using 0-based sequence positions diff --git a/test/t/55_vg_primers.t b/test/t/55_vg_primers.t index 252116f5c80..dcf5ea33014 100644 --- a/test/t/55_vg_primers.t +++ b/test/t/55_vg_primers.t @@ -30,7 +30,7 @@ is $(vg primers primers/y.split.out -x y.xg -s y.dist -m 140 | wc -l) 7 "Get th vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist > y.ref_pos_0.out vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist > y.ref_pos_11.out -diff -q <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$5=$6=""; print $0}' y.ref_pos_11.out) > diff_0_11 +diff -q <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_11.out) > 
diff_0_11 is $(cat diff_0_11 | wc -l) 0 "These two output files should have identical primers except for their positions on template" # clean up From 9c364fe5b2fee64e7fb1639a8414a6b3b2931355 Mon Sep 17 00:00:00 2001 From: AlanZhangUCSC Date: Sun, 29 Oct 2023 17:13:05 -0700 Subject: [PATCH 24/43] fix bug --- src/primer_filter.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index f24849818eb..596fc811b83 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -181,21 +181,29 @@ void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, primer.position_chromosome); handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + size_t cur_node_length = graph->get_length(cur_node_handle); + size_t cur_node_position = graph->get_position_of_step(cur_node_step_handle); + size_t cur_offset = primer.position_chromosome - cur_node_position; primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - string cur_node_sequence = graph->get_sequence(cur_node_handle); - // Get the index at which primer.sequence[0:index] maps to the first node. 
- // Stop here if the first node contains the entire primer sequence - size_t primer_matched_index = longest_match_len(primer, cur_node_sequence, primer_seq, true) - 1; - // If the first node containly a prefix of primer sequence, walk down the path and keep adding - // node until the entire primer sequence is covered - while (primer_matched_index < primer_seq.size()-1) { + if (primer.left) { + primer.offset = cur_offset; + } + size_t matched_length = 0; + while (cur_node_length - cur_offset < primer.length - matched_length) { + assert(graph->get_sequence(cur_node_handle).substr(cur_offset, cur_node_length - cur_offset) + == primer_seq.substr(matched_length, cur_node_length - cur_offset)); + matched_length += cur_node_length - cur_offset; + cur_offset = 0; cur_node_step_handle = graph->get_next_step(cur_node_step_handle); cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); + cur_node_length = graph->get_length(cur_node_handle); primer.mapped_nodes_ids.push_back(graph->get_id(cur_node_handle)); - cur_node_sequence = graph->get_sequence(cur_node_handle); - string primer_substr = primer_seq.substr(primer_matched_index + 1, primer.length - primer_matched_index - 1); - primer_matched_index += longest_match_len(primer, primer_substr, cur_node_sequence, false); } + assert(graph->get_sequence(cur_node_handle).substr(cur_offset, primer.length - matched_length) + == primer_seq.substr(matched_length, primer.length - matched_length)); + if (!primer.left) { + primer.offset = cur_offset + primer.length - matched_length; + } } size_t PrimerFinder::longest_match_len(Primer& primer, const string& left_seq, From a49b43af950c2143802c63c07810eece818a5ccf Mon Sep 17 00:00:00 2001 From: AlanZhangUCSC Date: Tue, 14 Nov 2023 11:59:37 -0800 Subject: [PATCH 25/43] up --- src/primer_filter.cpp | 180 +++++++++++++++++++++++--------- src/primer_filter.hpp | 16 ++- src/subcommand/primers_main.cpp | 93 +++++++++++------ src/unittest/primer_filter.cpp | 41 ++++---- 
test/primers/index.dist | Bin 0 -> 3008 bytes test/primers/index.min | Bin 0 -> 32856 bytes test/primers/y.gbwt | Bin 0 -> 1912 bytes test/primers/y.gg | Bin 0 -> 2277 bytes test/primers/y.giraffe.gbz | Bin 0 -> 2800 bytes test/primers/y.ri | Bin 0 -> 905 bytes test/t/55_vg_primers.t | 32 +++--- 11 files changed, 242 insertions(+), 120 deletions(-) create mode 100644 test/primers/index.dist create mode 100644 test/primers/index.min create mode 100644 test/primers/y.gbwt create mode 100644 test/primers/y.gg create mode 100644 test/primers/y.giraffe.gbz create mode 100644 test/primers/y.ri diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 596fc811b83..7a3290ec4f3 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -7,9 +7,15 @@ using namespace std; // Constructor PrimerFinder::PrimerFinder(const unique_ptr& graph_param, - const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle) { - graph = graph_param.get(); - distance_index = distance_index_param; + const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle, + const gbwtgraph::GBWTGraph& gbwt_graph_param, const gbwt::GBWT& gbwt_index_param, + const gbwt::FastLocate& r_index_param) + : graph(graph_param.get()), + distance_index(distance_index_param), + gbwt_graph(gbwt_graph_param), + gbwt_index(gbwt_index_param), + r_index(r_index_param) { + load_primers(primers_file_handle); } @@ -41,8 +47,8 @@ void PrimerFinder::add_primer_pair(const string& path_name, right_primer_offset, right_primer_length, false); primer_pair.linear_product_size = primer_pair.right_primer.position_template - primer_pair.left_primer.position_template + primer_pair.right_primer.length; - update_min_max_product_size(primer_pair); update_variation(primer_pair, path_name); + update_min_max_product_size(primer_pair); } void PrimerFinder::load_primers(ifstream& file_handle) { @@ -97,8 +103,8 @@ void PrimerFinder::load_primers(ifstream& file_handle) { 
map_to_nodes(primer_pair.right_primer, cur_path); primer_pair.linear_product_size = primer_pair.right_primer.position_template - primer_pair.left_primer.position_template + primer_pair.right_primer.length; - update_min_max_product_size(primer_pair); update_variation(primer_pair, cur_path); + update_min_max_product_size(primer_pair); chroms[cur_path].emplace_back(); chroms[cur_path].back().chromosome_name = cur_path; chroms[cur_path].back().template_position = cur_template_offset; @@ -157,17 +163,67 @@ void PrimerFinder::make_primer(Primer& primer, const string& path_name, map_to_nodes(primer, path_name); // Search and store corresponding nodes ids } -void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { - const Primer& left_primer = primer_pair.left_primer; - const Primer& right_primer = primer_pair.right_primer; +static string get_haplotype_sequence(gbwt::size_type sequence_visit_offset, handle_t start_handle, + handle_t end_handle, const gbwtgraph::GBWTGraph& gbwt_graph, size_t start_max, size_t end_max) { + + string haplotype; + gbwt::edge_type pos = gbwt::edge_type(gbwtgraph::GBWTGraph::handle_to_node(start_handle), sequence_visit_offset); + + if (pos == gbwt::invalid_edge() || pos.first == gbwt::ENDMARKER) { + return haplotype; + } - primer_pair.min_product_size = distance_index->minimum_distance(left_primer.mapped_nodes_ids[0], - false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], - false, right_primer.offset); + handle_t curr = gbwt_graph.node_to_handle(pos.first); + if (curr == end_handle) { + return haplotype; + } + gbwtgraph::view_type view = gbwt_graph.get_sequence_view(curr); + size_t offset = (view.second > start_max ? 
view.second - start_max : 0); + haplotype.append(view.first + offset, view.second - offset); + + while (true) { + pos = gbwt_graph.index->LF(pos); + if (pos.first == gbwt::ENDMARKER) { + break; + } + curr = gbwtgraph::GBWTGraph::node_to_handle(pos.first); + view = gbwt_graph.get_sequence_view(curr); + if (curr == end_handle) { + haplotype.append(view.first, std::min(view.second, end_max)); + break; + } else { + haplotype.append(view.first, view.second); + } + } + return haplotype; +} - primer_pair.max_product_size = distance_index->maximum_distance(left_primer.mapped_nodes_ids[0], - false, left_primer.offset, right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1], - false, right_primer.offset); +void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { + const auto& sequence_visits = primer_pair.sequence_visits; + + handle_t start_handle = gbwt_graph.get_handle(primer_pair.left_primer.mapped_nodes_ids.front()); + handle_t end_handle = gbwt_graph.get_handle(primer_pair.right_primer.mapped_nodes_ids.back()); + if (start_handle == end_handle) { + primer_pair.min_product_size = primer_pair.linear_product_size; + primer_pair.max_product_size = primer_pair.linear_product_size; + return; + } + + size_t start_max = gbwt_graph.get_length(start_handle) - primer_pair.left_primer.offset; + size_t end_max = primer_pair.right_primer.offset; + size_t minimum_distance = numeric_limits::max(); + size_t maximum_distance = 0; + for (const auto& visit : sequence_visits) { + string haplotype = get_haplotype_sequence(visit.second, start_handle, end_handle, gbwt_graph, start_max, end_max); + if (haplotype.size() < minimum_distance) { + minimum_distance = haplotype.size(); + } + if (haplotype.size() > maximum_distance) { + maximum_distance = haplotype.size(); + } + } + primer_pair.min_product_size = minimum_distance; + primer_pair.max_product_size = maximum_distance; } void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { @@ -253,43 
+309,73 @@ const string PrimerFinder::strip(const string& s) const { return s.substr(start, end+1); } -void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_name) { - const Primer& left_primer = primer_pair.left_primer; - const Primer& right_primer = primer_pair.right_primer; - - nid_t right_edge_node_id = right_primer.mapped_nodes_ids[right_primer.mapped_nodes_ids.size()-1]; - unordered_set primer_nodes_set; - for (size_t i = 0; i < left_primer.mapped_nodes_ids.size(); ++i) { - primer_nodes_set.insert(left_primer.mapped_nodes_ids[i]); +vector get_sequence_visits(const handle_t& handle, + const gbwt::FastLocate& r_index, const gbwtgraph::GBWTGraph& gbwt_graph) { + + vector sa = r_index.decompressSA(gbwt_graph.handle_to_node(handle)); + vector result; + result.reserve(sa.size()); + for (size_t i = 0; i < sa.size(); i++) { + result.push_back({ sa[i], i }); } - for (size_t i = 0; i < right_primer.mapped_nodes_ids.size(); ++i) { - primer_nodes_set.insert(right_primer.mapped_nodes_ids[i]); + std::sort(result.begin(), result.end(), [&](HaplotypePartitioner::sequence_type a, HaplotypePartitioner::sequence_type b) -> bool { + gbwt::size_type a_id = r_index.seqId(a.first); + gbwt::size_type a_offset = r_index.seqOffset(a.first); + gbwt::size_type b_id = r_index.seqId(b.first); + gbwt::size_type b_offset = r_index.seqOffset(b.first); + return ((a_id < b_id) || ((a_id == b_id) && (a_offset > b_offset))); + }); + return result; +} + +static void sa_to_da(std::vector& sequences, const gbwt::FastLocate& r_index) { + for (auto& sequence : sequences) { + sequence.first = r_index.seqId(sequence.first); } - - const path_handle_t& reference_path_handle = graph->get_path_handle(path_name); - step_handle_t cur_node_step_handle = graph->get_step_at_position(reference_path_handle, left_primer.position_template); - handle_t cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - net_handle_t cur_net_handle = distance_index->get_net(cur_node_handle, 
graph); - nid_t cur_node_id = graph->get_id(cur_node_handle); - while (true) { - size_t depth = distance_index->get_depth(cur_net_handle); - if (depth != 1) { - if (primer_nodes_set.find(cur_node_id) != primer_nodes_set.end()) { - primer_pair.no_variation_at_primers = false; - primer_pair.no_variation_in_products = false; - break; - } else { - primer_pair.no_variation_in_products = false; - } - } - if (cur_node_id == right_edge_node_id) { - break; +} + +void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_name) { + const vector& left_primer_node_ids = primer_pair.left_primer.mapped_nodes_ids; + const vector& right_primer_node_ids = primer_pair.right_primer.mapped_nodes_ids; + vector nodes_id; + merge(left_primer_node_ids.begin(), left_primer_node_ids.end(), + right_primer_node_ids.begin(), right_primer_node_ids.end(), back_inserter(nodes_id)); + handle_t cur_handle = gbwt_graph.get_handle(nodes_id[0]); + auto sequence_visits = get_sequence_visits(cur_handle, r_index, gbwt_graph); + sa_to_da(sequence_visits, r_index); + + for (size_t i = 1; i < nodes_id.size(); i++) { + cur_handle = gbwt_graph.get_handle(nodes_id[i]); + auto cur_sequence_visits = get_sequence_visits(cur_handle, r_index, gbwt_graph); + sa_to_da(cur_sequence_visits, r_index); + unordered_set seq_ids; + for (const auto& seq_visit : cur_sequence_visits) { + seq_ids.insert(seq_visit.first); } - cur_node_step_handle = graph->get_next_step(cur_node_step_handle); - cur_node_handle = graph->get_handle_of_step(cur_node_step_handle); - cur_net_handle = distance_index->get_net(cur_node_handle, graph); - cur_node_id = graph->get_id(cur_node_handle); + + sequence_visits.erase( + remove_if( + sequence_visits.begin(), + sequence_visits.end(), + [&seq_ids](const HaplotypePartitioner::sequence_type& visit) { + return seq_ids.find(visit.first) == seq_ids.end(); + } + ), + sequence_visits.end() + ); } + + auto unique_haplotypes = sequence_visits; + auto it = unique(unique_haplotypes.begin(), 
unique_haplotypes.end(), [this](const auto& a, const auto& b) { + const gbwt::PathName& path_name_a = this->gbwt_graph.index->metadata.path(gbwt::Path::id(a.first)); + const gbwt::PathName& path_name_b = this->gbwt_graph.index->metadata.path(gbwt::Path::id(b.first)); + return (path_name_a.sample == path_name_b.sample) && (path_name_a.phase == path_name_b.phase); + }); + unique_haplotypes.erase(it, unique_haplotypes.end()); + + primer_pair.sequence_visits = sequence_visits; + primer_pair.variation_level = static_cast(unique_haplotypes.size()) / static_cast(gbwt_graph.index->metadata.haplotypes()); + } vector PrimerFinder::split(const string& str) { diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index edfb9bb7d54..fa2fec23ec2 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -23,6 +23,8 @@ #include "traversal_finder.hpp" #include #include +#include "../primer_filter.hpp" +#include "../recombinator.hpp" using namespace std; @@ -58,18 +60,20 @@ struct PrimerPair { size_t template_position = numeric_limits::max(); size_t min_product_size = numeric_limits::max(); size_t max_product_size = numeric_limits::max(); - bool no_variation_at_primers = true; - bool no_variation_in_products = true; + double variation_level = 0.0; + vector sequence_visits; }; class PrimerFinder { private: - // vector primer_pairs; - // vector selected_primer_pairs; unordered_map> chroms; // map containing a vector of primer pairs for each chromosome const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; + const gbwtgraph::GBWTGraph& gbwt_graph; + const gbwt::GBWT& gbwt_index; + const gbwt::FastLocate& r_index; + public: PrimerFinder() = default; @@ -79,7 +83,9 @@ class PrimerFinder { * and pointer to SnarlDistanceIndex */ PrimerFinder(const unique_ptr& graph_param, - const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle); + const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle, + const 
gbwtgraph::GBWTGraph& gbwt_graph, const gbwt::GBWT& gbwt_index, + const gbwt::FastLocate& r_index); /** * Destructor diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index a700f55ec75..d651e272104 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -16,13 +16,15 @@ void help_primers(char** argv) { cerr << "usage: " << argv[0] << " primers [options] input.primer3 > filtered_primers.out" << endl << endl << "options:" << endl - << " -x, --xg-path FILE use this xg graph" << endl - << " -s, --snarl-index FILE use this snarl index" << endl - << " -z, --zero-variation allow no variance in the product" << endl - << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl - << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl - << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl - << " -a, --all-primers output all primers" << endl; + << " -x, --xg-path FILE use this xg graph" << endl + << " -s, --snarl-index FILE use this snarl index" << endl + << " -r, --r-index FILE use this r index" << endl + << " -g, --gbz FILE use this gbz file" << endl + << " -v, --variation-threshold DOUBLE output primers that work for at least this percentage of haplotypes (default: 0.8)" + << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl + << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl + << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl + << " -a, --all-primers output all primers" << endl; } size_t difference(const size_t& a, const size_t& b) { @@ -57,8 +59,7 @@ void print_tabular(const string& genome_name, const PrimerPair& primer_pair) { << 
primer_pair.linear_product_size << "\t" << primer_pair.min_product_size << "\t" << primer_pair.max_product_size << "\t" - << primer_pair.no_variation_at_primers << "\t" - << primer_pair.no_variation_in_products << endl; + << primer_pair.variation_level << endl; } int main_primers(int argc, char** argv) { @@ -70,9 +71,12 @@ int main_primers(int argc, char** argv) { string xg_path; string snarl_index_path; + string ri_path; + string gbz_path; bool zero_variation = false; bool all_primers = false; int tolerance = 10; + double variation_threshold = 0.8; int minimum_product_size = numeric_limits::max(); int maximum_product_size = numeric_limits::max(); @@ -82,19 +86,21 @@ int main_primers(int argc, char** argv) { while (true) { static struct option long_options[] = { - {"help", no_argument, 0, 'h'}, - {"xg-path", required_argument, 0, 'x'}, - {"snarl-index", required_argument, 0, 's'}, - {"zero-variance", required_argument, 0, 'z'}, - {"tolerance", required_argument, 0, 'l'}, - {"minimum-size", required_argument, 0, 'n'}, - {"maximum-size", required_argument, 0, 'm'}, - {"all-primers", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {"xg-path", required_argument, 0, 'x'}, + {"snarl-index", required_argument, 0, 's'}, + {"ri-path", required_argument, 0, 'r'}, + {"gbz-path", required_argument, 0, 'g'}, + {"variation-threshold", required_argument, 0, 'v'}, + {"tolerance", required_argument, 0, 'l'}, + {"minimum-size", required_argument, 0, 'n'}, + {"maximum-size", required_argument, 0, 'm'}, + {"all-primers", required_argument, 0, 'a'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:zl:n:m:a", long_options, &option_index); + c = getopt_long (argc, argv, "hx:s:r:g:l:n:m:a", long_options, &option_index); // Detect the end of the options. 
if (c == -1) break; @@ -108,11 +114,19 @@ int main_primers(int argc, char** argv) { case 's': snarl_index_path = optarg; break; - - case 'z': - zero_variation = true; + + case 'r': + ri_path = optarg; + break; + + case 'g': + gbz_path = optarg; break; + case 'v': + variation_threshold = parse(optarg); + break; + case 'l': tolerance = parse(optarg); break; @@ -146,7 +160,17 @@ int main_primers(int argc, char** argv) { } if (snarl_index_path.empty()) { - cerr << "error:[vg priemrs] snarl index file (-s) is required" << endl; + cerr << "error:[vg primers] snarl index file (-s) is required" << endl; + exit(1); + } + + if (ri_path.empty()) { + cerr << "error:[vg primers] r index file (-r) is required" << endl; + exit(1); + } + + if (gbz_path.empty()) { + cerr << "error:[vg primers] gbz file (-g) is required" << endl; exit(1); } @@ -154,13 +178,20 @@ int main_primers(int argc, char** argv) { SnarlDistanceIndex distance_index; unique_ptr graph; + gbwtgraph::GBWTGraph gbwt_graph; + gbwt::GBWT gbwt_index; + gbwt::FastLocate r_index; + load_r_index(r_index, ri_path); + load_gbz(gbwt_index, gbwt_graph, gbz_path); + gbwt_graph.set_gbwt(gbwt_index); + r_index.setGBWT(gbwt_index); distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_path); ifstream file_handle(primers_path); - PrimerFinder primer_finder(graph, &distance_index, file_handle); + PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); cout << "chrom\ttplfeat\ttplpos\tlpseq\trpseq\tlppostpl\trppostmp\tlpposchrom\trpposchrom\t" - << "lpnid\trpnid\tlplen\trplen\tlinsize\tminsize\tmaxsize\tnvprimers\tnvproducts" << endl; + << "lpnid\trpnid\tlplen\trplen\tlinsize\tminsize\tmaxsize\tvarlevel" << endl; vector reference_paths = primer_finder.get_reference_paths(); for (size_t i = 0; i < reference_paths.size(); ++i) { @@ -170,11 +201,7 @@ int main_primers(int argc, char** argv) { const PrimerPair& primer_pair = primer_pairs[j]; if (all_primers) { 
print_tabular(path_name, primer_pair); - } else if (zero_variation) { - if (primer_pair.no_variation_in_products) { - print_tabular(path_name, primer_pair); - } - } else if (primer_pair.no_variation_at_primers) { + } else { if (minimum_product_size != numeric_limits::max() && primer_pair.min_product_size < minimum_product_size) { continue; @@ -183,11 +210,15 @@ int main_primers(int argc, char** argv) { primer_pair.max_product_size > maximum_product_size) { continue; } + if (difference(primer_pair.linear_product_size, primer_pair.min_product_size) > tolerance - || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { - continue; + || difference(primer_pair.linear_product_size, primer_pair.max_product_size) > tolerance) { + continue; } + if (primer_pair.variation_level < variation_threshold) { + continue; + } print_tabular(path_name, primer_pair); } } diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 3a38aebbe29..db7d5620686 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -24,26 +24,35 @@ #include #include "xg.hpp" #include "../primer_filter.hpp" +#include "../recombinator.hpp" namespace vg { namespace unittest { using namespace std; + TEST_CASE( "filter simple primers", "[primer_filter]" ) { SnarlDistanceIndex distance_index; unique_ptr graph; + gbwtgraph::GBWTGraph gbwt_graph; + gbwt::GBWT gbwt_index; + gbwt::FastLocate r_index; string snarl_index_path = "test/primers/y.dist"; string xg_graph_path = "test/primers/y.xg"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); + load_r_index(r_index, "test/primers/y.ri"); + load_gbz(gbwt_index, gbwt_graph, "test/primers/y.giraffe.gbz"); + gbwt_graph.set_gbwt(gbwt_index); + r_index.setGBWT(gbwt_index); SECTION("template_position=0") { string primers_path = "test/primers/y.primer3_with_ref_pos.out"; ifstream file_handle(primers_path); - PrimerFinder primer_finder(graph, 
&distance_index, file_handle); + PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); SECTION("Loads the correct number of chromosomes") { REQUIRE(primer_finder.total_reference_paths() == 1); @@ -95,24 +104,21 @@ using namespace std; }; const vector min_product_sizes { - 124, 142, 144, 137, 135, 97, 112 + 124, 142, 144, 137, 136, 99, 112 }; const vector max_product_sizes { - 124, 145, 147, 140, 138, 100, 112 + 124, 145, 147, 140, 137, 99, 112 }; const vector linear_product_sizes { 124, 145, 147, 140, 136, 99, 112 }; - const vector no_variations_at_primers { - true, true, true, true, true, false, true + const vector variation_level { + 1.0, 1.0, 1.0, 1.0, 1.0, 0.33333, 1.0 }; - const vector no_variations_in_products { - false, false, false, false, false, false, true - }; const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); @@ -129,8 +135,7 @@ using namespace std; REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); - REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); - REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); + REQUIRE(abs(variation_level[i] - primer_pairs[i].variation_level) <= 0.0001); } SECTION("Check that primers are assigned with correct nodes") { @@ -161,7 +166,7 @@ using namespace std; SECTION("template_position=11") { string primers_path = "test/primers/y.primer3_with_ref_pos_11.out"; ifstream file_handle(primers_path); - PrimerFinder primer_finder(graph, &distance_index, file_handle); + PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); SECTION("Loads the correct number of chromosomes") { REQUIRE(primer_finder.total_reference_paths() == 1); @@ -213,24 +218,21 @@ using namespace std; }; const vector min_product_sizes { - 
124, 142, 144, 137, 135, 97, 112 + 124, 142, 144, 137, 136, 99, 112 }; const vector max_product_sizes { - 124, 145, 147, 140, 138, 100, 112 + 124, 145, 147, 140, 137, 99, 112 }; const vector linear_product_sizes { 124, 145, 147, 140, 136, 99, 112 }; - const vector no_variations_at_primers { - true, true, true, true, true, false, true + const vector variation_level { + 1.0, 1.0, 1.0, 1.0, 1.0, 0.33333, 1.0 }; - const vector no_variations_in_products { - false, false, false, false, false, false, true - }; const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); @@ -247,8 +249,7 @@ using namespace std; REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); - REQUIRE(no_variations_at_primers[i] == primer_pairs[i].no_variation_at_primers); - REQUIRE(no_variations_in_products[i] == primer_pairs[i].no_variation_in_products); + REQUIRE(abs(variation_level[i] - primer_pairs[i].variation_level) <= 0.0001); } SECTION("Check that primers are assigned with correct nodes") { diff --git a/test/primers/index.dist b/test/primers/index.dist new file mode 100644 index 0000000000000000000000000000000000000000..0deabd8490c7fb0d03202bfca48785a5f158fd16 GIT binary patch literal 3008 zcmZ{l&1+m`6vm%>lQe0Xrjxd5qK$Plp#+K$wMcbg?`5hJ7h)Cz7Tg38gl-J9D~WW_ z6h%~K;UbEQ1YEcf?7}}_tl~-`Ai5HXJ8@&6e}Mj;^Uj^jXng16eb0N&dCqe_=FaQ8 zo8NboR@N-bvejy}E4k4^EL&F&6HdiKo{2U9kyW~pk{-;j0}7<({x{QU1A;| zJNrc_oEp9ZlTW(l=%gH%H#;yS1q*eDWkUJjG$>8t2P z^rn|xU^=x_k_740U14@EfaMfB!LT)Jf&?;(zBZyJe%vOG4h?#6AUICE?QFG^_XQ^# zIGD>FBna!OT6R0VQ5t}#lNpTibE`hpZ%wN|oqzkbRLR*3N!}js{aACm)JIh$c{$g9 zU2TNv&>(n?<(iPz>gpog&om{iJRD{S03dk%nb~aTLYV!Q`D*Q-!;9?R^2LJ-=U?zNKBJJ9YuQVBE`=>&<6m?r)sswg~g6$W= zBb*Fq1WJX_)1W_?3g~Y+O5GD7+zp{K8Nw&w(>WeK2wvZ0#bg{lIHkR3<8(*vPy3xP zi<|0eS}K;cGZ73(XS7myKbxzQsauzxMuRY>Hc#X+e6E=I)}zwZfL;SVCz(kxE@;~4 zq*n%^-eRO^37!P0O*_u+gbE4-;Mve+VRf 
zhR!P2yJ|xR;g3vv)hruiZCnN6Zc5(~8i8vwmdMx1WMaY2Zfy3j^n%d`eSe{Etw!gFB_v0e@G zsKFi^deZiPajJp;G{>~=SkE> z5v> zsiHY%C$#a{0It!<=Tb}+4(>vKu3Z4Q(SaRY`g2`dQE_hoqswf~{&%kHi2)cxQ+0hx z2agYUQ}}`)|DLKMwGldK(o}J!#Swi?cKx*})k&7jhH5~8u-cE|v}TCGHmWE42+UIE zYB|nOfAFDnU7(NX&)yU2{y2cqrHDKQuJN#AkZ^d3sX{oSKUhLwt85PwF~(O^RHOzH3l;8FACgf+FkKcT$n+(P`%p()~~_f_!cMCKo7 yE}}-qMY3OvMB0`KnrVj=p3kje4Fl||`=PAC@-Qf7>(F7l5s&qaMd@9nwEh8fqwNU* literal 0 HcmV?d00001 diff --git a/test/primers/index.min b/test/primers/index.min new file mode 100644 index 0000000000000000000000000000000000000000..67273dc2214ddba5ccbddc9b002c1e03d683315b GIT binary patch literal 32856 zcmeGl3s6+o^?nvv*yX!?BzyxHP>En9MTN))%(w{}70sATLe#c1?WApnRvi4nqGcxh zgnZ1{v>J$?fvPb~Bq=pA!J?y4ql5E1Tml}RQv$F_de|1p?Q6cQKXR6zCI$W_Q zGX8^rr2ZK1_$v+n+4SeD^w%A+_5dR<)6d0}zHLmuPI&v{%ev^Swq_UXQ}N^Yrx^>j zMIS$kJn!>zHvi}ueX!E}F)f;x^SLPwO^)RHCz0NkleaOL{|3J3*o!yiAV)31hrFo$ z$>wK2ax)I>&*rU$h?KjL*dt^ZeNHg?Nb;vE{M`II3*vxt9>re3e>C&gwglak;4}=? z7S>l1=X3g)`-huyJU`jgxe{o-hK*nP^S?mwjPX`W?>nhTvx$!n4|?lH$A^6T<*Aw( z`Rb%~*zsH5{#e>a{l<>?-F0(;%0l}TT<(l&k0?T(BKBX+2K?xhyJ5h=FpR!YME>qs z-+q-lex>T`y3-SmFPjCmV*Y{9fT;a1)xN!)Q9zV_Q*x5~x-L#ff#H1ou=#yx{wi7C zR#L4$j2(Ay`P|KZO62op`QulLKbi2^9+$*dfI|cCpXvW%rvH)i8(qDc!EH#b9;SbO zBKx6P|4~crqu$6xWc*fSpnG!;CIFr3$Klrp|4x7JWHs;;(XUeUp_AjHLs8Jt!ub*J zi$8Fx{sIV`&EaGE{be@)N%}X*zE$&0^ZGGz=kmks)9=YM(f{*K>82es`2io1asm8K zh(+K(6xF%j_s#iOC&GUz`b2sb! 
zr093Ar?e%rU?p}Fp-%0R+iWnp|(ZvWPJK=*QZJG(`MCOmFS1$CH8}6V0 zE5%2o;8(gl%JH)`2b-=zHw*pOsh=92DG&8V{uav5RUv?f-N$MC&R9@I!fT$ub%M%I z=k3nL*OOOZhm%}Cu=$w=+H62SIQ{T__08HapcS!?4;40at<3a9eoHv~kI~y-F#RTJ zpDI26lx|-b-px>B;rNzLX#>v|C#)baQ#ea z_>^w{=*npsFMO4SyzJ|LWYGWbOKJKm-TvOB6;C!tYz3}X;%~~v7KiZqIN)lbfBpL# zY=s(tc0N9jP&Tb5*9g`}nf(zWZhr{&-91tJF1~I~J*o)?F5}exkSkxs?b^&+H&UIoWewW<#<<@86mp^BMAR(y2*7w-_oy_KM+3}S~<^cT_pxIphnE#VY zp5vwculFlw@4K|Z8F-5Luc@CIo1cx$K%Nh&{UOEwOC_83XHGeYe3x_jkEMO~QA@fZ ze?SdYBJouz`0VA30;2JMTc=ObI+nqQ0z}5Awc%fsbU%N^xwnw|f?%6aw~DP#ygo7F z$5i=|5EGm@_GCiGHclTV|Emf9V~~H@b!Yn9 zxhG~Kx0A!#AEW#aYwHhN@BcX=fP|ey;z!IoJ#WBTrQ~n!fx4TO@iR z$su>cjsd8U|H7i9Nm&5ADZm#%WP3W5Z~FKMko2!hW6x)|?hJx1CXOGokJgj=HV{9s zI{yU#ME$3dqSG%VEbYaPBKbScJAYodJi{5m*_^(kiBH%%{spSAwheo}gK!nckNN); z$rJ!ccr%d57}6+YAJ{s5!4Sdk;mMJu<1Cfh0Z47*d-&pz&a^bUfeN+*#%Y9l0_7=&n zS{qWQy&IZJ@~=7ln0+(C{Ph_ty5~18?uS4`e@WqgTC>?${Z0aQ{S)usJU_H?&7N&* zno5!CTV8I?U)Y~q+!xl0P%PIUOg~PL{FS!gF1kenPZFU|^t#%*yo6=g%SP8J9P8ADbUAC@XO^d%nr`Zs(%s$^l|)| ze?AmHmfOCy>r>;NIFJZ@&E@MHMV0!GWc@$xetLcQu~Oh8vOW@;TITh|`~(mj%jv`H z*Nx16#bo=*C3N*HQ0?I5=JtiP@fUWt#UQNE{x)mlC%6AfVS^>~u`qrc6w$IZFZ>{M zisJIc?8Bk>*NNQN!Sd}G`K9x6bNyC4^D{%|z*-!dl>EFpjE6UUFO z?-Y~u$`PCoraFJW^i|hLHf!i`_C`}*ER(S=>W!AAEdC`})G`-}pj^S@mBq&hxvrYt%Xx_WW>8%=(P-2UzLdllF({>S zK-~@}iP*2t-&{yZD0)8gI)Q<}L4<$#`!nehB1HD}TR{May-V@W*2{C9XJ(KHA=KLA zgTpv0JlZ<{l!ov2uG?Pj34elNbdc6d9j)Bk2x2U(xez<@TQ!k{kDaNHKyv zoImWSmzM}X$KZTPx$xQRGYTl?|4Y#~Z}%=5fx<2#@h9PUfL1(7#D7byaZBsp837)t z}q@i!^_PtXI>(iUNOgTrq=K3dmrX=v4VBeg>Kk^GPKUb7=1GV%T&ru~zt;p_x| z>}93(S1I`@F|vOc1DY5<|FQM&q5R^DTM`oo`i*F^NV!z|QsrIkr3YS}TZ>&R%#V<2 z|6wop-nnJDI?e>$WBB-E@r9xNV~3lf?=O4qID{7N4`J3%c^~u;v45@v(|-U+2dY(E zzD5&YP?~?YoOd{SQuZCYG3Jc;qM{uVzSkq3BZ5j zwg#kM3U1|BScKht# ze`n+a-M3tR>L?9Xlgs}9k)ZeX4{=M%{|3Ns1;L=>*F3&?PB0*Vn?m-1wdF%teu@NI zh1Q?Q-mUbx^$2<&;Pf@$pXk(!4S^?Pu)j!rRf;~#g#bT<3wis@zI#aiKTC=}a?8o~ z^n!$CanRjL^T(1SC+HD=P*=_M7n}c15q{hE$4H*3tc$2(T025O*@nsUk{E6oBm%}}$n`h;4_Ky!&0>`(4l47aOJ;rqY#C86a4#+Oi|{+L|iU 
zagpOR$2pEAepoh650m5>=g11>e|2`Dk8oXGf>EcM1Ue&yn--Mmt#*Yh@`-lo6tw#$!s zTV0;_&hOjwW9(v{=ljvEjaby>{)?op{Jxw}eYKnTIOzS}_DgpBNl)!0d{)2Oo%1}c z5{X~kE?(v|M!i7N!Jkf3hP}}lAfk-PQih#Q=aP#%6XY7DGr@k`+18zhdmqRL2apSq zRYV`eFZaVoGeO_{d(G0bxG zYILS7N7hD1xmJ;Pf(fdSj5a3;VEp4;>S$8~>WHcVXNan`?t0v(!8vdi`Mi}IFq_C| z1WjZ#f)?BtaMP|8cQd#IE+St>jws!R*+D=%lXM}syP3PuJHa(@1^s$Zh00g4`-trU zeH>B`=*Qn{%iYB7VGi1+H!yDzF+k=J@&I=)IGH;(`Zn$nlsJcT7l|W?Cr?7(Ct(y! z(q@#1Y4&4an*BK1EX5|08G4?y8L$B6z!G?X`=NC&Vy-a4GHut8g7(XH4E-2$mEKR# zRm7~i-~JTV>vVapsk3e;ddXft3oayR9zjxqRc%R3CC|IOck+I_FeUG|*kP9Mjw`R^ zJ)~`Ni5<=pTk<46l`VaQvt0qn`z8fbu;fX6*p_dC=yD$Up*{oYcEV<-uMIU!1&Yiq zdD2I|yP~HUkNa$waDo*mQdd~&NSxR%eGf^V#HX_5JWs_(_*yS4B|p+slm8F$T*&hy r_3|oLhTHy{Q}k4Q#4j9<6MfGQaf0|$vE+%L_a(W+PQ~H8|Hb_QAR=ty literal 0 HcmV?d00001 diff --git a/test/primers/y.gg b/test/primers/y.gg new file mode 100644 index 0000000000000000000000000000000000000000..d7330a802c5028faaa3ccd49e2e5768ab94256b0 GIT binary patch literal 2277 zcmYk8Uuc$P7{VeLo|3M_4d z2y_Dp5+Zl7K#3^`j_9ChfgMCxOAyq=2oFTTKyWvy0terU{Saqt(Ij1b!bw#?VB#}ZU zL9$Zm4uB4jo{`+PTpUq??9MY-YDQ2R+o{5qEzMs z3HYMArlM1MR(Dt$l{0H$vw${HiUj48BebM2SM)(7 zlHw6({N_laH8n;1S}L}dz6zoFb)~z-9WYf!heWjt_y)kR%1kZSQ!YRVl6QmFLT%hz9Wi>1=%z6kguesV;OoV2BQ;$YZwNE-3( z#YRcGgI^wEUW_!plzPFMLbsgh9LQJ+GI7w-Ej4WIgB106WKx)ICDPLrHK`L6s=Py? zQ3DSEj^w%mqG%V60*1`y*|gm$LE}k;oFZozxkc%yQ(QC^93=)Tg4JwB>dLEWHT$$5 zXiyBqxGWTBYf`gN14005pyG{ws#HOw$yOZI9BZJBdOZaxV$(B{)?Ty5gIuSg!{jMn zyYv{-*YYjYji_NFJ*_z*4=bpm7ezrw9urxvZA|Y&Z8SbvLWmlMh_M|)X6iuWsTL9k zS^{!y%Cnf%VAvzN0r#2_sy}`WT$EZDIvGl(sncZr6Fj{U^g5J8vRXA-J|%%CG5Mr$ zNYhpt%UNkL7`e2N0059!5d@a^nf$atfIJD@7KL#j5)cwRD-7t1nS$gFB1aJ-*l@%b zfmuhhC_YKTlr$qq(~(L;8Ywnw|1ii}EMX*20HVRm#giD6t5;%lnI|+D%tXT-HAs)Q zIn#KJY*1)HnVK3J@qo8awb>dODh|Gi{qd?*fvA{8hz<=&%fd>Ci1-dTDoQT!WmH7k z^GX}J7DOup$&xcF(GspQU@XP97%<78%nK}eT=30Om|CMgauw@)qz+L#?WNxG_kLrm z)z!D$+EDHIa`Vjex@{9(o2Fl1H}uw{PsXozpSt)%@5@IYZJg-d+W%eu?H$L@oqTV! 
znqS@X*YMS2&wMzt?d+Ccz8-z${AXugI{4z`waJOT6Q{o)8r?ZR_UZedpB;Gc`@73M zgMAKfZh5w|^flEe|Xno;|U8^MwyC ztsGiB^5@?Z&%So;=b5Q*FI?GlZ~CY48-vdc|9Ifd`}2RS-kn;y{LdG&N1yxy^I!Av IpBg{^127Msm;e9( literal 0 HcmV?d00001 diff --git a/test/primers/y.giraffe.gbz b/test/primers/y.giraffe.gbz new file mode 100644 index 0000000000000000000000000000000000000000..c85a5bacce4f1728e032da6278aa4a56d5c920d1 GIT binary patch literal 2800 zcmbtV4NTN!7=PdYft&}?fux8#J~i2iTncfT8y|Ndh#0~6%SY(Rhanta9$beG3_n1} znzpvK+G?{VEw2x1vC(Ro)kd}1m`4jIvbC(u%?5-BNznDX`@he0r?=Spv*(`o`8~hi z^E~hS{~*^?5zE>AD@ZIQ;a3PxoUr@UN3KXmeBYQv;bLK^s;)U$cc{LhvALzy*4A!! zD2Ygs4-~#bc&gX@b(%VV(z z1n%odNAswC^goF>LU@D4v(Xpou+gRmce-BWX`h24RtZ1w`iv%FL*^%M>a}V2@^ zvl><16*cPMvd>)(h>U z9Mz@$X?*IlY4`gT^5TFq>aw#zW|!SgTzlDci!5{Ib}tkNKFujf>?9!W`Qrgh!dl`h zU@9g_Y9?XD1Mw4rbPy~=Vmt^Iiu6h)tws7VSUBu)z!89%iiaV@-Q=rak(~2YoJ&z& zh4h5Nh(^7&uxQjDX?v@rvX2sz($1WP;7^#L$Mu}iS!Pn z(awZ)1}qzv1$ZZ572{@vc_8Fql6=59V&($iyI{}4b_0LTQw5dxAT9y75LSXiDunGt zzDOxohO~^ZLTOrxuo8@NDE9*{$KCUs%mIbH59w+k79373ARK|kdlLK*1U0Y*w5b83 z5$l7nMy%_AHKSO)SHW8$Yl7KeEwFajVWeMF(rpMkFoGRzj{@?vw<}}tPJ~C$`xtPh zl8%VIcf<28bUDFMr%RdW6xRH>=LZ6gM-URvm8duh89ndx-qHIlXpG)(ihVtOcSLzc z?;+ZHFN%G26qAnl{$lDwex)ctx^XJ#$CHluzA=3h2&eO)?=|Vl1C}c%0=Fd%Y#r|@>GX-6bt!7j&#KL7t?ueu~g%0ZEF|*ju`!apyz^~ zAF8L_&{1W6U{1pQ^`Sg@UmoEPZA1W+_vcARdAu*)i(-G?SNC7Mfm);0bM_`7q{OrFafdE1>4Ucb(Vmu1eycaX7JQHbWfB_7aN!RIIEG>b#O8b>;_9Jj|l|8cVQ z=EtF%J{_2IFU{NuPQJIhwJp{8?MIpqx}Q3}n7`V^h8FnP)t0b3Wq-H~0qN;CCQLE^ zOt%}a^d*fBo{C!-PySwYE_r;%fzz6sKSw2K%7(%R&ldi@m>N7*WHEFZv|pte>@V#z zT<@~F|BTM=ctuxmKK$HF)LC7;V@UVAE#|M_>0w>#$r5!#ZcoQMy2L8S(qzwPDL;fx z9BUX%jy0c=p0~Q&)LZ7(Tx+{)sd9W-S{mz!`OrSHG+4;z*PLjYFAbXdZmMgl=Nr@2 z>l5?Qwrit@?Yj5wnn!agyZ7|0cK>4e!gW62*1LU4;tc+e%Uq_Ip08!nmre86|6fl` B7pDLK literal 0 HcmV?d00001 diff --git a/test/primers/y.ri b/test/primers/y.ri new file mode 100644 index 0000000000000000000000000000000000000000..c466b0f0ac3efd541683e311cb09da9308dc0cde GIT binary patch literal 905 
zcmccNXr9f;z`$Su#K<6@4Z`CzxzTg0?Bh!wnH2`7P6#$H;*+pe%d0uqytrFVw`RkQ zFUhO|z8gYJc!acD1=^GXW2B6Vu5l_&6PcwD*%EeW(o}(2CPg1qmMUwgi@GRjaym|( zIAu-CmM4P3?hLa{iaxzpRi5E5b~HrA_~O2qzcQ3&1qI(w)M{HLvMS*`SF3hQk>7<* z*F)zx8Y`GyaUOg=EtUV?-xBsKj=S|7r7IHEryQJK>MY)%xJto?$&llijDhv5n2mtvO2pG3RXa>exk%{38PkBX#A_3LV zP=gPZHoYg=+R>#n`AE%!DOI=vwGwi(Dwn8BP-3t%*q z%fR5k1YvSAI7qU zVoGgGN6&=GQ>V|Ivv}p&4V$*HGH5^po`Zpnor{}?k6%DgNLWNnTtZSxMpjN*v#C*%Es2t-qG2`)y>_*)7!__&p$9II3zSIJTfXeCN@4HF)2AMJtH$K zH!r`Su&B7Sth}m21|n NTfcGB=54^h1OQN{@X7!H literal 0 HcmV?d00001 diff --git a/test/t/55_vg_primers.t b/test/t/55_vg_primers.t index dcf5ea33014..717b8dbc0ec 100644 --- a/test/t/55_vg_primers.t +++ b/test/t/55_vg_primers.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 13 +plan tests 11 # make graph and snarl dist index vg construct -r small/y.fa -v small/y.vcf.gz > y.vg @@ -14,22 +14,20 @@ vg convert -x y.vg > y.xg vg index -j y.dist y.vg -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist | wc -l) 6 "Get the expected number of primer pairs" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -a | wc -l) 6 "Get the expected number of primer pairs using --all-primers tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -z | wc -l) 1 "Get the expected number of primer pairs using --zero-variation tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -l 2 | wc -l) 3 "Get the expected number of primer pairs using --tolerance tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" - -is $(vg primers primers/y.split.out -x y.xg -s y.dist | wc -l) 9 "Get the expected number of primer pairs" -is $(vg primers primers/y.split.out -x y.xg -s 
y.dist -a | wc -l) 11 "Get the expected number of primer pairs using --all-primers tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -z | wc -l) 1 "Get the expected number of primer pairs using --zero-variation tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -l 2 | wc -l) 6 "Get the expected number of primer pairs using --tolerance tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -m 140 | wc -l) 7 "Get the expected number of primer pairs using --maximum-size tag" - -vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist > y.ref_pos_0.out -vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist > y.ref_pos_11.out +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 6 "Get the expected number of primer pairs" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -a | wc -l) 6 "Get the expected number of primer pairs using --all-primers tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 3 "Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" + +is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 9 "Get the expected number of primer pairs" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz 
-a | wc -l) 11 "Get the expected number of primer pairs using --all-primers tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 6 "Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 7 "Get the expected number of primer pairs using --maximum-size tag" + +vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz> y.ref_pos_0.out +vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz > y.ref_pos_11.out diff -q <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_11.out) > diff_0_11 is $(cat diff_0_11 | wc -l) 0 "These two output files should have identical primers except for their positions on template" From d7b6fc9d9e6483267cc94c5e169e78c61737d99f Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 30 Dec 2024 18:11:50 +0100 Subject: [PATCH 26/43] Fix typos --- src/primer_filter.hpp | 2 +- src/subcommand/primers_main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index fa2fec23ec2..6d3647a624d 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -192,4 +192,4 @@ class PrimerFinder { } -#endif /* primder_filter_hpp */ \ No newline at end of file +#endif /* primer_filter_hpp */ diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index d651e272104..c60587882ba 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -20,7 +20,7 @@ void help_primers(char** argv) { << " -s, --snarl-index FILE use this snarl index" << endl << " -r, 
--r-index FILE use this r index" << endl << " -g, --gbz FILE use this gbz file" << endl - << " -v, --variation-threshold DOUBLE output primers that work for at least this percentage of haplotypes (default: 0.8)" + << " -v, --variation-threshold DOUBLE output primers that work for at least this percentage of haplotypes (default: 0.8)" << endl << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl << " -m, --maximum-size INT maximum product size allowed (has precedence over --tolerance)" << endl From 41953dc65c61af1d8ef4fb23d55c39f6d1007f46 Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 30 Dec 2024 18:12:29 +0100 Subject: [PATCH 27/43] Add wiki page for primer filtering but in the wrong directory --- doc/primer_filter.md | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 doc/primer_filter.md diff --git a/doc/primer_filter.md b/doc/primer_filter.md new file mode 100644 index 00000000000..0ec1f78295e --- /dev/null +++ b/doc/primer_filter.md @@ -0,0 +1,66 @@ + +`vg primers` can be used to filter PCR primers based on properties of the pangenome such as whether there are variations in the primers and the possible lengths of the PCR product. +`vg primers` takes pangenome indexes and the output of `primer3` as input and outputs a `.tsv` file of the input primers and properties from the pangenome. + +# Get primers with `primer3` + +The input to `vg primers` is the output of the command line version of [`primer3`](https://github.com/primer3-org/primer3). + +`primer3` requires a config file formatted like: +``` +SEQUENCE_ID=CHM13#0#chr17|BRCA1P1|exon_1|44026826 +SEQUENCE_TEMPLATE=CATGT... 
+PRIMER_NUM_RETURN=10 +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 += +``` + +The `SEQUENCE_ID` field must be formatted correctly for `vg primers` to find the correct location of the primers in the pangenome. +There are four fields in `SEQUENCE_ID` separated by `|`. +They are the reference path name in the graph, the name of the gene, the exon or intron, and the offset of the sequence in the path. +The names of the reference paths in the graph can be found with `vg paths -L -R`. +The `SEQUENCE_TEMPLATE` is the nucleotide sequence the primers are found from. + +`primer3` must be run with the `--format_output` flag. + +# Filtering primers with `vg primers` + +`vg primers` requires the following indexes of the pangenome: + +- the *xg* index created with `vg index -x` +- the *distance*/*snarl* index created with `vg index -j` +- the [*r-index*](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) created with `vg gbwt -r` +- the [*gbz*](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) created with `vg gbwt` + +# Interpreting output of `vg primers` + +`vg primers` outputs a tsv file with the following fields for each primer: + +| field | definition | description | +| ---------- | ---------- | ----------- | +| chrom | chromosome | reference path name, the first field in `SEQUENCE_ID` | +| tplfeat | template feature | the second and third fields in `SEQUENCE_ID` | +| tplpos | template position | offset along the reference path, the fourth field in `SEQUENCE_ID` | +| lpseq | left primer sequence | the nucleotide sequence of the left primer | +| rpseq | right primer sequence | the nucleotide sequence of the right primer | +| lppostpl | left primer position template | the offset of the left primer in the template sequence | +| rppostmp | right primer position template | the offset of the right 
primer in the template sequence | +| lpposchrom | left primer position chromosome | the offset of the left primer in the reference path | +| rpposchrom | right primer position chromosome | the offset of the right primer in the reference path | +| lpnid | left primer mapped node ids | the node ids that the left primer overlaps in the graph | +| rpnid | right primer mapped node ids | the node ids that the right primer overlaps in the graph | +| lplen | left primer length | the length in nucleotides of the left primer | +| rplen | right primer length | the length in nucleotides of the right primer | +| linsize | linear product size | the length of the product in the linear genome (including primer lengths) | +| minsize | minimum product size | the minimum length of the product according to the pangenome | +| maxsize | maximum product size | the maximum length of the product according to the pangenome | +| varlevel | variation level | a measure of variation in the primers (the number of unique haplotypes / the total number of haplotypes) | + From 6f37a843d31527e786adffb89bc93e26c6907a04 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 3 Jan 2025 20:15:53 +0100 Subject: [PATCH 28/43] Make primer finder use the unformatted primer3 output --- doc/primer_filter.md | 2 - src/primer_filter.cpp | 158 +++++++++----- test/primers/y.primer3_with_ref_pos.out | 231 ++++++++++++--------- test/primers/y.primer3_with_ref_pos_11.out | 231 ++++++++++++--------- test/primers/y.primer3config.split | 8 +- 5 files changed, 377 insertions(+), 253 deletions(-) diff --git a/doc/primer_filter.md b/doc/primer_filter.md index 0ec1f78295e..110655ac01d 100644 --- a/doc/primer_filter.md +++ b/doc/primer_filter.md @@ -29,8 +29,6 @@ They are the reference path name in the graph, the name of the gene, the exon or The names of the reference paths in the graph can be found with `vg paths -L -R`. The `SEQUENCE_TEMPLATE` is the nucleotide sequence the primers are found from. 
-`primer3` must be run with the `--format_output` flag. - # Filtering primers with `vg primers` `vg primers` requires the following indexes of the pangenome: diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 7a3290ec4f3..7f8be038bb9 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -5,6 +5,8 @@ namespace vg { using namespace std; +//#define DEBUG_PRIMER_FILTER + // Constructor PrimerFinder::PrimerFinder(const unique_ptr& graph_param, const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle, @@ -56,64 +58,108 @@ void PrimerFinder::load_primers(ifstream& file_handle) { //ifstream file_handle(path_to_primers); assert(file_handle.is_open()); - vector cur_fields; - size_t cur_template_offset; - string cur_template_info; - string cur_template_feature; - string cur_path; + + // Regular expressions for matching fields with numbers + std::regex left_seq ("^PRIMER_LEFT_[0-9]*_SEQUENCE.*"); + std::regex right_seq ("^PRIMER_RIGHT_[0-9]*_SEQUENCE.*"); + std::regex left_primer_position ("^PRIMER_LEFT_[0-9]*=.*"); + std::regex right_primer_position ("^PRIMER_RIGHT_[0-9]*=.*"); + + vector::iterator curr_primer_iterator; + + string chromosome_name = ""; + string template_feature = ""; + size_t template_position = std::numeric_limits::max(); + string line; while (getline(file_handle, line)) { line = strip(line); - size_t left_primer_line_start = line.find("LEFT PRIMER"); - size_t right_primer_line_start = line.find("RIGHT PRIMER"); - - if (startswith(line, "PRIMER PICKING RESULTS FOR")) { - if (chroms.size() != 0) { - assert(chroms[cur_path].back().right_primer.sequence.empty()); - chroms[cur_path].pop_back(); + + if (line == "=") { + //End of the record for one primer pair + chromosome_name = ""; + template_feature = ""; + template_position = std::numeric_limits::max(); + } else if (startswith(line, "SEQUENCE_ID")) { + //Get the path, path offset, and features from the sequence_id of the primer pair + //This will be the same for 
all primer pairs up to the next "=" + vector cur_fields = move(split(split(line,'=')[1], '|')); + + chromosome_name = cur_fields[0]; + template_feature = cur_fields[1] + "|" + cur_fields[2]; + template_position = stoi(cur_fields[3]); +#ifdef DEBUG_PRIMER_FILTER + cerr << "FIND PRIMERS FOR INPUT " << line << ": " << chromosome_name << ", " << template_feature << ", " << template_position << endl; +#endif + + } else if (startswith(line, "SEQUENCE_TEMPLATE")) { + //If the path from the sequence id isn't in the graph, then get the path and path offset by mapping the sequence + string seq = split(line,'=')[1]; + //TODO: Actually do this + + } else if (startswith(line, "PRIMER_PAIR_NUM_RETURNED")) { + //How many primer pairs for this sequence template? + + size_t primer_pair_count = stoi(split(line,'=')[1]); + size_t new_vector_start = chroms[chromosome_name].size(); + + //Add all new primer pairs for this template + chroms.reserve(new_vector_start + primer_pair_count); + for (size_t i = 0 ; i < primer_pair_count ; i++) { + chroms[chromosome_name].emplace_back(); + chroms[chromosome_name].back().chromosome_name = chromosome_name; + chroms[chromosome_name].back().template_position = template_position; + chroms[chromosome_name].back().template_feature = template_feature; + chroms[chromosome_name].back().right_primer.left = false; } - cur_fields = move(split(line)); - cur_template_info = cur_fields[cur_fields.size()-1]; - cur_fields = move(split(cur_template_info,'|')); - cur_template_feature = cur_fields[1] + "|" + cur_fields[2]; - cur_template_offset = stoi(cur_fields[3]); - cur_path = cur_fields[0]; - chroms[cur_path].emplace_back(); - chroms[cur_path].back().chromosome_name = cur_path; - chroms[cur_path].back().template_position = cur_template_offset; - chroms[cur_path].back().template_feature = cur_template_feature; - chroms[cur_path].back().right_primer.left = false; - } else if (left_primer_line_start != string::npos) { - cur_fields = 
move(split(line.substr(left_primer_line_start, line.size()))); - PrimerPair& primer_pair = chroms[cur_path].back(); - primer_pair.left_primer.position_chromosome = stoi(cur_fields[2]) + cur_template_offset; - primer_pair.left_primer.position_template = stoi(cur_fields[2]); - primer_pair.left_primer.sequence = cur_fields[9]; - primer_pair.left_primer.length = stoi(cur_fields[3]); - } else if (startswith(line, "RIGHT PRIMER")) { - cur_fields = move(split(line.substr(right_primer_line_start, line.size()))); - PrimerPair& primer_pair = chroms[cur_path].back(); - primer_pair.right_primer.position_chromosome = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1 + cur_template_offset; - primer_pair.right_primer.position_template = stoi(cur_fields[2]) - stoi(cur_fields[3]) + 1; - primer_pair.right_primer.sequence = cur_fields[9]; - primer_pair.right_primer.length = stoi(cur_fields[3]); - - assert(!primer_pair.left_primer.sequence.empty()); - map_to_nodes(primer_pair.left_primer, cur_path); - map_to_nodes(primer_pair.right_primer, cur_path); - primer_pair.linear_product_size = primer_pair.right_primer.position_template - - primer_pair.left_primer.position_template + primer_pair.right_primer.length; - update_variation(primer_pair, cur_path); - update_min_max_product_size(primer_pair); - chroms[cur_path].emplace_back(); - chroms[cur_path].back().chromosome_name = cur_path; - chroms[cur_path].back().template_position = cur_template_offset; - chroms[cur_path].back().template_feature = cur_template_feature; - chroms[cur_path].back().right_primer.left = false; + + //Set the current primer pair iterator to the first new pair + curr_primer_iterator = chroms[chromosome_name].begin() + new_vector_start; + } else if (std::regex_match(line, left_seq)) { + curr_primer_iterator->left_primer.sequence = split(line, '=')[1]; +#ifdef DEBUG_PRIMER_FILTER + cerr << "\tGet left sequence " << line << ": " << curr_primer_iterator->left_primer.sequence << endl; +#endif + } else if 
(std::regex_match(line, right_seq)) { + curr_primer_iterator->right_primer.sequence = split(line, '=')[1]; +#ifdef DEBUG_PRIMER_FILTER + cerr << "\tGet right sequence " << line << ": " << curr_primer_iterator->left_primer.sequence << endl; +#endif + } else if (std::regex_match(line, left_primer_position)) { + //Start position and length of the left primer + curr_primer_iterator->left_primer.position_template = stoi(split(split(line, '=')[1], ',')[0]); + curr_primer_iterator->left_primer.length = stoi(split(split(line, '=')[1], ',')[1]); + curr_primer_iterator->left_primer.position_chromosome = curr_primer_iterator->left_primer.position_template + template_position; +#ifdef DEBUG_PRIMER_FILTER + cerr << "old template position " << template_position << endl; + cerr << "\tGet left primer position" << line << ": " << curr_primer_iterator->left_primer.position_template << ", " + << curr_primer_iterator->left_primer.length << ", " + << curr_primer_iterator->left_primer.position_chromosome << endl; +#endif + } else if (std::regex_match(line, right_primer_position)) { +#ifdef DEBUG_PRIMER_FILTER + cerr << "\tGet right primer position" << line << ": " << curr_primer_iterator->left_primer.position_chromosome << endl; +#endif + //Start position and length of the right primer + size_t right_primer_offset = stoi(split(split(line, '=')[1], ',')[0]); + curr_primer_iterator->right_primer.length = stoi(split(split(line, '=')[1], ',')[1]); + curr_primer_iterator->right_primer.position_chromosome = right_primer_offset - curr_primer_iterator->right_primer.length + 1 + template_position; + curr_primer_iterator->right_primer.position_template = right_primer_offset - curr_primer_iterator->right_primer.length + 1; + + //This is the last thing for this primer pair, so update the primer pair + map_to_nodes(curr_primer_iterator->left_primer, chromosome_name); + map_to_nodes(curr_primer_iterator->right_primer, chromosome_name); + + curr_primer_iterator->linear_product_size = 
curr_primer_iterator->right_primer.position_template + - curr_primer_iterator->left_primer.position_template + curr_primer_iterator->right_primer.length; + update_variation(*curr_primer_iterator, chromosome_name); + update_min_max_product_size(*curr_primer_iterator); + + //Iterator to the new primer pair + curr_primer_iterator++; } + } - assert(chroms[cur_path].back().right_primer.sequence.empty()); - chroms[cur_path].pop_back(); } const size_t PrimerFinder::total_reference_paths() const { @@ -227,6 +273,9 @@ void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { } void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { +#ifdef DEBUG_PRIMER_FILTER + cerr << "Map to nodes for primer " << primer.sequence << endl; +#endif path_handle_t reference_path_handle = graph->get_path_handle(path_name); string primer_seq; if (primer.left) { @@ -335,6 +384,9 @@ static void sa_to_da(std::vector& sequences } void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_name) { +#ifdef DEBUG_PRIMER_FILTER + cerr << "Update variation" << endl; +#endif const vector& left_primer_node_ids = primer_pair.left_primer.mapped_nodes_ids; const vector& right_primer_node_ids = primer_pair.right_primer.mapped_nodes_ids; vector nodes_id; @@ -406,4 +458,4 @@ bool PrimerFinder::startswith(const string& str, const string& prefix) { return str.compare(0, prefix.length(), prefix) == 0; } -} \ No newline at end of file +} diff --git a/test/primers/y.primer3_with_ref_pos.out b/test/primers/y.primer3_with_ref_pos.out index f4fc72bc305..daf5dce0c09 100644 --- a/test/primers/y.primer3_with_ref_pos.out +++ b/test/primers/y.primer3_with_ref_pos.out @@ -1,97 +1,134 @@ -PRIMER PICKING RESULTS FOR y|gene|feature|0 - -No mispriming library specified -Using 0-based sequence positions -OLIGO start len tm gc% any_th 3'_th hairpin seq -LEFT PRIMER 362 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC -RIGHT PRIMER 485 20 60.11 60.00 0.00 0.00 37.39 
GCCAGAAGAGCCTCAAGGAG -SEQUENCE SIZE: 1001 -INCLUDED REGION SIZE: 1001 - -PRODUCT SIZE: 124, PAIR ANY_TH COMPL: 6.57, PAIR 3'_TH COMPL: 4.13 - - 0 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTG - - - 60 CTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAG - - - 120 TGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTG - - - 180 GAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGA - - - 240 ACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTA - - - 300 CTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAAT - - - 360 AGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATC - >>>>>>>>>>>>>>>>>>>> - - 420 CTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCT - <<<<<<<<<<<<<< - - 480 TCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGT - <<<<<< - - 540 AGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTA - - - 600 AGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGT - - - 660 GTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTT - - - 720 TTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGAT - - - 780 CTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCT - - - 840 CAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAG - - - 900 CTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTT - - - 960 TAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA - - -KEYS (in order of precedence): ->>>>>> left primer -<<<<<< right primer - -ADDITIONAL OLIGOS - start len tm gc% any_th 3'_th hairpin seq - - 1 LEFT PRIMER 620 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG - RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 2 LEFT PRIMER 618 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC - RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 3 LEFT PRIMER 625 20 59.82 60.00 10.25 0.00 37.05 
GAGGCTCAAGGACAGCTCTC - RIGHT PRIMER 764 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 - - 4 LEFT PRIMER 819 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC - RIGHT PRIMER 954 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT - PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 - -Statistics - con too in in not no tm tm high high high high - sid many tar excl ok bad GC too too any_th 3'_th hair- poly end - ered Ns get reg reg GC% clamp low high compl compl pin X stab ok -Left 4586 0 0 0 0 41 0 3329 93 0 0 6 15 0 1102 -Right 4585 0 0 0 0 40 0 3257 106 0 0 0 15 0 1167 -Pair Stats: -considered 106, unacceptable product size 101, ok 5 -libprimer3 release 2.6.1 - - +SEQUENCE_ID=y|gene|feature|0 +SEQUENCE_TEMPLATE=CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 
+PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 +PRIMER_LEFT_EXPLAIN=considered 4586, GC content failed 41, low tm 3329, high tm 93, high hairpin stability 6, long poly-x seq 15, ok 1102 +PRIMER_RIGHT_EXPLAIN=considered 4585, GC content failed 40, low tm 3257, high tm 106, long poly-x seq 15, ok 1167 +PRIMER_PAIR_EXPLAIN=considered 106, unacceptable product size 101, ok 5 +PRIMER_LEFT_NUM_RETURNED=5 +PRIMER_RIGHT_NUM_RETURNED=5 +PRIMER_INTERNAL_NUM_RETURNED=0 +PRIMER_PAIR_NUM_RETURNED=5 +PRIMER_PAIR_0_PENALTY=0.214768 +PRIMER_LEFT_0_PENALTY=0.107017 +PRIMER_RIGHT_0_PENALTY=0.107752 +PRIMER_LEFT_0_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_0_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_0=362,20 +PRIMER_RIGHT_0=485,20 +PRIMER_LEFT_0_TM=60.107 +PRIMER_RIGHT_0_TM=60.108 +PRIMER_LEFT_0_GC_PERCENT=55.000 +PRIMER_RIGHT_0_GC_PERCENT=60.000 +PRIMER_LEFT_0_SELF_ANY_TH=19.30 +PRIMER_RIGHT_0_SELF_ANY_TH=0.00 +PRIMER_LEFT_0_SELF_END_TH=0.00 +PRIMER_RIGHT_0_SELF_END_TH=0.00 +PRIMER_LEFT_0_HAIRPIN_TH=31.71 +PRIMER_RIGHT_0_HAIRPIN_TH=37.39 +PRIMER_LEFT_0_END_STABILITY=3.5100 +PRIMER_RIGHT_0_END_STABILITY=3.6900 +PRIMER_PAIR_0_COMPL_ANY_TH=6.57 +PRIMER_PAIR_0_COMPL_END_TH=4.13 +PRIMER_PAIR_0_PRODUCT_SIZE=124 +PRIMER_PAIR_0_PRODUCT_TM=81.8 +PRIMER_PAIR_1_PENALTY=0.351214 +PRIMER_LEFT_1_PENALTY=0.172352 +PRIMER_RIGHT_1_PENALTY=0.178861 +PRIMER_LEFT_1_SEQUENCE=GAGTCGAGGCTCAAGGACAG +PRIMER_RIGHT_1_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_1=620,20 +PRIMER_RIGHT_1=764,20 +PRIMER_LEFT_1_TM=59.828 +PRIMER_RIGHT_1_TM=60.179 +PRIMER_LEFT_1_GC_PERCENT=60.000 +PRIMER_RIGHT_1_GC_PERCENT=55.000 +PRIMER_LEFT_1_SELF_ANY_TH=15.95 +PRIMER_RIGHT_1_SELF_ANY_TH=0.00 +PRIMER_LEFT_1_SELF_END_TH=0.00 +PRIMER_RIGHT_1_SELF_END_TH=0.00 +PRIMER_LEFT_1_HAIRPIN_TH=35.67 +PRIMER_RIGHT_1_HAIRPIN_TH=0.00 +PRIMER_LEFT_1_END_STABILITY=3.5100 +PRIMER_RIGHT_1_END_STABILITY=4.2000 +PRIMER_PAIR_1_COMPL_ANY_TH=0.00 +PRIMER_PAIR_1_COMPL_END_TH=0.00 +PRIMER_PAIR_1_PRODUCT_SIZE=145 +PRIMER_PAIR_1_PRODUCT_TM=83.5 
+PRIMER_PAIR_2_PENALTY=0.351214 +PRIMER_LEFT_2_PENALTY=0.172352 +PRIMER_RIGHT_2_PENALTY=0.178861 +PRIMER_LEFT_2_SEQUENCE=CAGAGTCGAGGCTCAAGGAC +PRIMER_RIGHT_2_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_2=618,20 +PRIMER_RIGHT_2=764,20 +PRIMER_LEFT_2_TM=59.828 +PRIMER_RIGHT_2_TM=60.179 +PRIMER_LEFT_2_GC_PERCENT=60.000 +PRIMER_RIGHT_2_GC_PERCENT=55.000 +PRIMER_LEFT_2_SELF_ANY_TH=15.95 +PRIMER_RIGHT_2_SELF_ANY_TH=0.00 +PRIMER_LEFT_2_SELF_END_TH=13.47 +PRIMER_RIGHT_2_SELF_END_TH=0.00 +PRIMER_LEFT_2_HAIRPIN_TH=37.94 +PRIMER_RIGHT_2_HAIRPIN_TH=0.00 +PRIMER_LEFT_2_END_STABILITY=3.8500 +PRIMER_RIGHT_2_END_STABILITY=4.2000 +PRIMER_PAIR_2_COMPL_ANY_TH=0.00 +PRIMER_PAIR_2_COMPL_END_TH=0.00 +PRIMER_PAIR_2_PRODUCT_SIZE=147 +PRIMER_PAIR_2_PRODUCT_TM=83.6 +PRIMER_PAIR_3_PENALTY=0.354392 +PRIMER_LEFT_3_PENALTY=0.175531 +PRIMER_RIGHT_3_PENALTY=0.178861 +PRIMER_LEFT_3_SEQUENCE=GAGGCTCAAGGACAGCTCTC +PRIMER_RIGHT_3_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_3=625,20 +PRIMER_RIGHT_3=764,20 +PRIMER_LEFT_3_TM=59.824 +PRIMER_RIGHT_3_TM=60.179 +PRIMER_LEFT_3_GC_PERCENT=60.000 +PRIMER_RIGHT_3_GC_PERCENT=55.000 +PRIMER_LEFT_3_SELF_ANY_TH=10.25 +PRIMER_RIGHT_3_SELF_ANY_TH=0.00 +PRIMER_LEFT_3_SELF_END_TH=0.00 +PRIMER_RIGHT_3_SELF_END_TH=0.00 +PRIMER_LEFT_3_HAIRPIN_TH=37.05 +PRIMER_RIGHT_3_HAIRPIN_TH=0.00 +PRIMER_LEFT_3_END_STABILITY=3.2000 +PRIMER_RIGHT_3_END_STABILITY=4.2000 +PRIMER_PAIR_3_COMPL_ANY_TH=26.57 +PRIMER_PAIR_3_COMPL_END_TH=26.57 +PRIMER_PAIR_3_PRODUCT_SIZE=140 +PRIMER_PAIR_3_PRODUCT_TM=83.2 +PRIMER_PAIR_4_PENALTY=0.360353 +PRIMER_LEFT_4_PENALTY=0.326264 +PRIMER_RIGHT_4_PENALTY=0.034089 +PRIMER_LEFT_4_SEQUENCE=TCCAGAAGCTGCTCTTTCCC +PRIMER_RIGHT_4_SEQUENCE=GCCTGGGTAGCTTTGGATGT +PRIMER_LEFT_4=819,20 +PRIMER_RIGHT_4=954,20 +PRIMER_LEFT_4_TM=59.674 +PRIMER_RIGHT_4_TM=60.034 +PRIMER_LEFT_4_GC_PERCENT=55.000 +PRIMER_RIGHT_4_GC_PERCENT=55.000 +PRIMER_LEFT_4_SELF_ANY_TH=1.00 +PRIMER_RIGHT_4_SELF_ANY_TH=0.00 +PRIMER_LEFT_4_SELF_END_TH=0.00 +PRIMER_RIGHT_4_SELF_END_TH=0.00 
+PRIMER_LEFT_4_HAIRPIN_TH=34.56 +PRIMER_RIGHT_4_HAIRPIN_TH=0.00 +PRIMER_LEFT_4_END_STABILITY=3.9700 +PRIMER_RIGHT_4_END_STABILITY=3.0600 +PRIMER_PAIR_4_COMPL_ANY_TH=13.72 +PRIMER_PAIR_4_COMPL_END_TH=10.53 +PRIMER_PAIR_4_PRODUCT_SIZE=136 +PRIMER_PAIR_4_PRODUCT_TM=83.6 += diff --git a/test/primers/y.primer3_with_ref_pos_11.out b/test/primers/y.primer3_with_ref_pos_11.out index 2ac75ef9695..952f6b1ec16 100644 --- a/test/primers/y.primer3_with_ref_pos_11.out +++ b/test/primers/y.primer3_with_ref_pos_11.out @@ -1,97 +1,134 @@ -PRIMER PICKING RESULTS FOR y|gene|feature|11 - -No mispriming library specified -Using 0-based sequence positions -OLIGO start len tm gc% any_th 3'_th hairpin seq -LEFT PRIMER 351 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC -RIGHT PRIMER 474 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG -SEQUENCE SIZE: 990 -INCLUDED REGION SIZE: 990 - -PRODUCT SIZE: 124, PAIR ANY_TH COMPL: 6.57, PAIR 3'_TH COMPL: 4.13 - - 0 TGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAAC - - - 60 TAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGAC - - - 120 AATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAA - - - 180 ATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTT - - - 240 AATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGA - - - 300 GGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCA - >>>>>>>>> - - 360 TAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCT - >>>>>>>>>>> - - 420 TGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTC - <<<<<<<<<<<<<<<<<<<< - - 480 ATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGAT - - - 540 CTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGG - - - 600 GCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGT - - - 660 AACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACA - - - 720 TGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGG - - - 780 CCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCAT - - - 840 
TTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCC - - - 900 TCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTG - - - 960 TGGACTAAGGACAAAGGTGCGGGGAGATGA - - -KEYS (in order of precedence): ->>>>>> left primer -<<<<<< right primer - -ADDITIONAL OLIGOS - start len tm gc% any_th 3'_th hairpin seq - - 1 LEFT PRIMER 609 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG - RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 2 LEFT PRIMER 607 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC - RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 3 LEFT PRIMER 614 20 59.82 60.00 10.25 0.00 37.05 GAGGCTCAAGGACAGCTCTC - RIGHT PRIMER 753 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 - - 4 LEFT PRIMER 808 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC - RIGHT PRIMER 943 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT - PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 - -Statistics - con too in in not no tm tm high high high high - sid many tar excl ok bad GC too too any_th 3'_th hair- poly end - ered Ns get reg reg GC% clamp low high compl compl pin X stab ok -Left 4531 0 0 0 0 41 0 3277 93 0 0 6 15 0 1099 -Right 4530 0 0 0 0 40 0 3203 106 0 0 0 15 0 1166 -Pair Stats: -considered 106, unacceptable product size 101, ok 5 -libprimer3 release 2.6.1 - - +SEQUENCE_ID=y|gene|feature|11 
+SEQUENCE_TEMPLATE=TGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 +PRIMER_LEFT_EXPLAIN=considered 4531, GC content failed 41, low tm 3277, high tm 93, high hairpin stability 6, long poly-x seq 15, ok 1099 +PRIMER_RIGHT_EXPLAIN=considered 4530, GC content failed 40, low tm 3203, high tm 106, long poly-x seq 15, ok 1166 +PRIMER_PAIR_EXPLAIN=considered 106, unacceptable product size 101, ok 5 +PRIMER_LEFT_NUM_RETURNED=5 +PRIMER_RIGHT_NUM_RETURNED=5 +PRIMER_INTERNAL_NUM_RETURNED=0 +PRIMER_PAIR_NUM_RETURNED=5 +PRIMER_PAIR_0_PENALTY=0.214768 +PRIMER_LEFT_0_PENALTY=0.107017 +PRIMER_RIGHT_0_PENALTY=0.107752 +PRIMER_LEFT_0_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_0_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_0=351,20 +PRIMER_RIGHT_0=474,20 +PRIMER_LEFT_0_TM=60.107 +PRIMER_RIGHT_0_TM=60.108 +PRIMER_LEFT_0_GC_PERCENT=55.000 
+PRIMER_RIGHT_0_GC_PERCENT=60.000 +PRIMER_LEFT_0_SELF_ANY_TH=19.30 +PRIMER_RIGHT_0_SELF_ANY_TH=0.00 +PRIMER_LEFT_0_SELF_END_TH=0.00 +PRIMER_RIGHT_0_SELF_END_TH=0.00 +PRIMER_LEFT_0_HAIRPIN_TH=31.71 +PRIMER_RIGHT_0_HAIRPIN_TH=37.39 +PRIMER_LEFT_0_END_STABILITY=3.5100 +PRIMER_RIGHT_0_END_STABILITY=3.6900 +PRIMER_PAIR_0_COMPL_ANY_TH=6.57 +PRIMER_PAIR_0_COMPL_END_TH=4.13 +PRIMER_PAIR_0_PRODUCT_SIZE=124 +PRIMER_PAIR_0_PRODUCT_TM=81.8 +PRIMER_PAIR_1_PENALTY=0.351214 +PRIMER_LEFT_1_PENALTY=0.172352 +PRIMER_RIGHT_1_PENALTY=0.178861 +PRIMER_LEFT_1_SEQUENCE=GAGTCGAGGCTCAAGGACAG +PRIMER_RIGHT_1_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_1=609,20 +PRIMER_RIGHT_1=753,20 +PRIMER_LEFT_1_TM=59.828 +PRIMER_RIGHT_1_TM=60.179 +PRIMER_LEFT_1_GC_PERCENT=60.000 +PRIMER_RIGHT_1_GC_PERCENT=55.000 +PRIMER_LEFT_1_SELF_ANY_TH=15.95 +PRIMER_RIGHT_1_SELF_ANY_TH=0.00 +PRIMER_LEFT_1_SELF_END_TH=0.00 +PRIMER_RIGHT_1_SELF_END_TH=0.00 +PRIMER_LEFT_1_HAIRPIN_TH=35.67 +PRIMER_RIGHT_1_HAIRPIN_TH=0.00 +PRIMER_LEFT_1_END_STABILITY=3.5100 +PRIMER_RIGHT_1_END_STABILITY=4.2000 +PRIMER_PAIR_1_COMPL_ANY_TH=0.00 +PRIMER_PAIR_1_COMPL_END_TH=0.00 +PRIMER_PAIR_1_PRODUCT_SIZE=145 +PRIMER_PAIR_1_PRODUCT_TM=83.5 +PRIMER_PAIR_2_PENALTY=0.351214 +PRIMER_LEFT_2_PENALTY=0.172352 +PRIMER_RIGHT_2_PENALTY=0.178861 +PRIMER_LEFT_2_SEQUENCE=CAGAGTCGAGGCTCAAGGAC +PRIMER_RIGHT_2_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_2=607,20 +PRIMER_RIGHT_2=753,20 +PRIMER_LEFT_2_TM=59.828 +PRIMER_RIGHT_2_TM=60.179 +PRIMER_LEFT_2_GC_PERCENT=60.000 +PRIMER_RIGHT_2_GC_PERCENT=55.000 +PRIMER_LEFT_2_SELF_ANY_TH=15.95 +PRIMER_RIGHT_2_SELF_ANY_TH=0.00 +PRIMER_LEFT_2_SELF_END_TH=13.47 +PRIMER_RIGHT_2_SELF_END_TH=0.00 +PRIMER_LEFT_2_HAIRPIN_TH=37.94 +PRIMER_RIGHT_2_HAIRPIN_TH=0.00 +PRIMER_LEFT_2_END_STABILITY=3.8500 +PRIMER_RIGHT_2_END_STABILITY=4.2000 +PRIMER_PAIR_2_COMPL_ANY_TH=0.00 +PRIMER_PAIR_2_COMPL_END_TH=0.00 +PRIMER_PAIR_2_PRODUCT_SIZE=147 +PRIMER_PAIR_2_PRODUCT_TM=83.6 +PRIMER_PAIR_3_PENALTY=0.354392 +PRIMER_LEFT_3_PENALTY=0.175531 
+PRIMER_RIGHT_3_PENALTY=0.178861 +PRIMER_LEFT_3_SEQUENCE=GAGGCTCAAGGACAGCTCTC +PRIMER_RIGHT_3_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_3=614,20 +PRIMER_RIGHT_3=753,20 +PRIMER_LEFT_3_TM=59.824 +PRIMER_RIGHT_3_TM=60.179 +PRIMER_LEFT_3_GC_PERCENT=60.000 +PRIMER_RIGHT_3_GC_PERCENT=55.000 +PRIMER_LEFT_3_SELF_ANY_TH=10.25 +PRIMER_RIGHT_3_SELF_ANY_TH=0.00 +PRIMER_LEFT_3_SELF_END_TH=0.00 +PRIMER_RIGHT_3_SELF_END_TH=0.00 +PRIMER_LEFT_3_HAIRPIN_TH=37.05 +PRIMER_RIGHT_3_HAIRPIN_TH=0.00 +PRIMER_LEFT_3_END_STABILITY=3.2000 +PRIMER_RIGHT_3_END_STABILITY=4.2000 +PRIMER_PAIR_3_COMPL_ANY_TH=26.57 +PRIMER_PAIR_3_COMPL_END_TH=26.57 +PRIMER_PAIR_3_PRODUCT_SIZE=140 +PRIMER_PAIR_3_PRODUCT_TM=83.2 +PRIMER_PAIR_4_PENALTY=0.360353 +PRIMER_LEFT_4_PENALTY=0.326264 +PRIMER_RIGHT_4_PENALTY=0.034089 +PRIMER_LEFT_4_SEQUENCE=TCCAGAAGCTGCTCTTTCCC +PRIMER_RIGHT_4_SEQUENCE=GCCTGGGTAGCTTTGGATGT +PRIMER_LEFT_4=808,20 +PRIMER_RIGHT_4=943,20 +PRIMER_LEFT_4_TM=59.674 +PRIMER_RIGHT_4_TM=60.034 +PRIMER_LEFT_4_GC_PERCENT=55.000 +PRIMER_RIGHT_4_GC_PERCENT=55.000 +PRIMER_LEFT_4_SELF_ANY_TH=1.00 +PRIMER_RIGHT_4_SELF_ANY_TH=0.00 +PRIMER_LEFT_4_SELF_END_TH=0.00 +PRIMER_RIGHT_4_SELF_END_TH=0.00 +PRIMER_LEFT_4_HAIRPIN_TH=34.56 +PRIMER_RIGHT_4_HAIRPIN_TH=0.00 +PRIMER_LEFT_4_END_STABILITY=3.9700 +PRIMER_RIGHT_4_END_STABILITY=3.0600 +PRIMER_PAIR_4_COMPL_ANY_TH=13.72 +PRIMER_PAIR_4_COMPL_END_TH=10.53 +PRIMER_PAIR_4_PRODUCT_SIZE=136 +PRIMER_PAIR_4_PRODUCT_TM=83.6 += diff --git a/test/primers/y.primer3config.split b/test/primers/y.primer3config.split index e34445c97b2..046319a2f20 100644 --- a/test/primers/y.primer3config.split +++ b/test/primers/y.primer3config.split @@ -1,5 +1,5 @@ -SEQUENCE_ID=y,100 
-SEQUENCE_TEMPLATE=TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA +SEQUENCE_ID=y|gene|feature|11 +SEQUENCE_TEMPLATE=TGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA PRIMER_TASK=generic PRIMER_PICK_LEFT_PRIMER=1 PRIMER_PICK_INTERNAL_OLIGO=0 @@ -10,8 +10,8 @@ PRIMER_MAX_SIZE=22 PRIMER_PRODUCT_SIZE_RANGE=75-150 PRIMER_EXPLAIN_FLAG=1 = -SEQUENCE_ID=y,601 
-SEQUENCE_TEMPLATE=GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +SEQUENCE_ID=y|gene|feature|0 +SEQUENCE_TEMPLATE=CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA PRIMER_TASK=generic PRIMER_PICK_LEFT_PRIMER=1 PRIMER_PICK_INTERNAL_OLIGO=0 From b841067409ac0d1bd4efdfb2bcbe7ef5896fa147 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 3 Jan 2025 23:01:52 +0100 Subject: [PATCH 29/43] Map template sequence if the given path isn't in the graph --- src/primer_filter.cpp | 65 ++++++++- src/primer_filter.hpp | 16 ++- src/subcommand/primers_main.cpp | 25 +++- src/unittest/primer_filter.cpp | 117 ++++++++++++++- test/primers/y.min | Bin 0 -> 32856 bytes .../y.primer3_with_ref_pos_11.nopath.out | 
134 ++++++++++++++++++ 6 files changed, 344 insertions(+), 13 deletions(-) create mode 100644 test/primers/y.min create mode 100644 test/primers/y.primer3_with_ref_pos_11.nopath.out diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 7f8be038bb9..06de657a342 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -1,5 +1,7 @@ #include "primer_filter.hpp" #include +#include "surjector.hpp" +#include "hts_alignment_emitter.hpp" namespace vg { @@ -11,12 +13,13 @@ using namespace std; PrimerFinder::PrimerFinder(const unique_ptr& graph_param, const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle, const gbwtgraph::GBWTGraph& gbwt_graph_param, const gbwt::GBWT& gbwt_index_param, - const gbwt::FastLocate& r_index_param) + const gbwt::FastLocate& r_index_param, MinimizerMapper* giraffe_mapper_param) : graph(graph_param.get()), distance_index(distance_index_param), gbwt_graph(gbwt_graph_param), gbwt_index(gbwt_index_param), - r_index(r_index_param) { + r_index(r_index_param), + giraffe_mapper(giraffe_mapper_param) { load_primers(primers_file_handle); } @@ -70,6 +73,7 @@ void PrimerFinder::load_primers(ifstream& file_handle) { string chromosome_name = ""; string template_feature = ""; size_t template_position = std::numeric_limits::max(); + bool has_path = false; string line; while (getline(file_handle, line)) { @@ -80,6 +84,7 @@ void PrimerFinder::load_primers(ifstream& file_handle) { chromosome_name = ""; template_feature = ""; template_position = std::numeric_limits::max(); + has_path = false; } else if (startswith(line, "SEQUENCE_ID")) { //Get the path, path offset, and features from the sequence_id of the primer pair //This will be the same for all primer pairs up to the next "=" @@ -88,14 +93,21 @@ void PrimerFinder::load_primers(ifstream& file_handle) { chromosome_name = cur_fields[0]; template_feature = cur_fields[1] + "|" + cur_fields[2]; template_position = stoi(cur_fields[3]); + has_path = 
graph->has_path(chromosome_name); + if (!has_path) { + cerr << "warning: primer finder can't find a path named " << chromosome_name << " in the graph" << endl << "\tfalling back on mapping the template sequence" << endl; + } #ifdef DEBUG_PRIMER_FILTER cerr << "FIND PRIMERS FOR INPUT " << line << ": " << chromosome_name << ", " << template_feature << ", " << template_position << endl; #endif - } else if (startswith(line, "SEQUENCE_TEMPLATE")) { + } else if (startswith(line, "SEQUENCE_TEMPLATE") && !has_path) { //If the path from the sequence id isn't in the graph, then get the path and path offset by mapping the sequence string seq = split(line,'=')[1]; - //TODO: Actually do this + if (giraffe_mapper == nullptr) { + throw std::runtime_error("error: primer filter doesn't have a minimizer file to map the template"); + } + std::tie(chromosome_name, template_position) = get_graph_coordinates_from_sequence(seq); } else if (startswith(line, "PRIMER_PAIR_NUM_RETURNED")) { //How many primer pairs for this sequence template? 
@@ -244,6 +256,51 @@ static string get_haplotype_sequence(gbwt::size_type sequence_visit_offset, hand return haplotype; } +std::pair PrimerFinder::get_graph_coordinates_from_sequence(const string& seq) { + string ref_name; + int64_t ref_offset; + bool ref_rev; + + //Make an alignment from the sequence + Alignment aln; + aln.set_sequence(seq); + aln.set_name("primer_template"); + + //Map the alignment + vector mapped = giraffe_mapper->map(aln); + + //If there wasn't an alignment, error + if (mapped.empty()) { + throw std::runtime_error("error: Primer filter could not map template sequence"); + } + + + + //Get the reference paths we want to align to + //This is done automatically + //TODO: These are empty but they could be command line arguments + string path_file; + vector path_names; + vector> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *graph); + unordered_set reference_paths; + reference_paths.reserve(sequence_dictionary.size()); + for (auto& entry : sequence_dictionary) { + reference_paths.insert(get<0>(entry)); + } + + //Surject the alignment onto the reference paths + Surjector surjector(graph); + surjector.surject(mapped.front(), reference_paths, ref_name, ref_offset, ref_rev); + + //TODO: Double check that this is correct. 
idk why ref_offset is an int and not a size_t + if (ref_rev) { + ref_offset -= seq.size(); + } + + return std::make_pair(ref_name, (size_t)ref_offset); +} + + void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { const auto& sequence_visits = primer_pair.sequence_visits; diff --git a/src/primer_filter.hpp b/src/primer_filter.hpp index 6d3647a624d..42c760922cd 100644 --- a/src/primer_filter.hpp +++ b/src/primer_filter.hpp @@ -18,6 +18,7 @@ #include #include "utility.hpp" #include "snarl_distance_index.hpp" +#include "minimizer_mapper.hpp" #include "integrated_snarl_finder.hpp" #include "genotypekit.hpp" #include "traversal_finder.hpp" @@ -70,6 +71,7 @@ class PrimerFinder { unordered_map> chroms; // map containing a vector of primer pairs for each chromosome const PathPositionHandleGraph* graph; const SnarlDistanceIndex* distance_index; + MinimizerMapper* giraffe_mapper; const gbwtgraph::GBWTGraph& gbwt_graph; const gbwt::GBWT& gbwt_index; const gbwt::FastLocate& r_index; @@ -83,9 +85,10 @@ class PrimerFinder { * and pointer to SnarlDistanceIndex */ PrimerFinder(const unique_ptr& graph_param, - const SnarlDistanceIndex* distance_index_param, ifstream& primers_file_handle, + const SnarlDistanceIndex* distance_index_param, + ifstream& primers_file_handle, const gbwtgraph::GBWTGraph& gbwt_graph, const gbwt::GBWT& gbwt_index, - const gbwt::FastLocate& r_index); + const gbwt::FastLocate& r_index, MinimizerMapper* giraffe_mapper_param=nullptr); /** * Destructor @@ -127,6 +130,15 @@ class PrimerFinder { * Private functions used by public or private functions. */ + /** + * Get the graph coordinates by mapping and surjecting the template + * To be used if the chromosome_name isn't a valid path + * Returns a pair of the path/chromosome name and the offset of the template in the path + * Used in: load_primers + */ + std::pair get_graph_coordinates_from_sequence(const string& seq); + + /** * Update minimum and maximum prodcut to a primer pair object. 
* Used in: add_primer_pair diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index c60587882ba..5760124deb2 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -16,10 +16,11 @@ void help_primers(char** argv) { cerr << "usage: " << argv[0] << " primers [options] input.primer3 > filtered_primers.out" << endl << endl << "options:" << endl - << " -x, --xg-path FILE use this xg graph" << endl - << " -s, --snarl-index FILE use this snarl index" << endl - << " -r, --r-index FILE use this r index" << endl - << " -g, --gbz FILE use this gbz file" << endl + << " -x, --xg-path FILE use this xg graph (required)" << endl + << " -s, --snarl-index FILE use this snarl index (required)" << endl + << " -r, --r-index FILE use this r index (required)" << endl + << " -g, --gbz FILE use this gbz file (required)" << endl + << " -M, --minimizers FILE use this minimizer file for mapping the template sequence, if necessary" << endl << " -v, --variation-threshold DOUBLE output primers that work for at least this percentage of haplotypes (default: 0.8)" << endl << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl @@ -73,6 +74,7 @@ int main_primers(int argc, char** argv) { string snarl_index_path; string ri_path; string gbz_path; + string min_path; bool zero_variation = false; bool all_primers = false; int tolerance = 10; @@ -91,6 +93,7 @@ int main_primers(int argc, char** argv) { {"snarl-index", required_argument, 0, 's'}, {"ri-path", required_argument, 0, 'r'}, {"gbz-path", required_argument, 0, 'g'}, + {"minimizers", required_argument, 0, 'M'}, {"variation-threshold", required_argument, 0, 'v'}, {"tolerance", required_argument, 0, 'l'}, {"minimum-size", required_argument, 0, 'n'}, @@ -100,7 +103,7 @@ int main_primers(int argc, char** argv) 
{ }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:r:g:l:n:m:a", long_options, &option_index); + c = getopt_long (argc, argv, "hx:s:r:g:M:v:l:n:m:a", long_options, &option_index); // Detect the end of the options. if (c == -1) break; @@ -123,6 +126,10 @@ int main_primers(int argc, char** argv) { gbz_path = optarg; break; + case 'M': + min_path = optarg; + break; + case 'v': variation_threshold = parse(optarg); break; @@ -188,7 +195,13 @@ int main_primers(int argc, char** argv) { distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_path); ifstream file_handle(primers_path); - PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); + MinimizerMapper* giraffe_mapper = nullptr; + if (!min_path.empty()) { + unique_ptr minimizer_index = vg::io::VPKG::load_one(min_path); + MinimizerMapper minimizer_mapper (gbwt_graph, *minimizer_index, &distance_index); + giraffe_mapper = &minimizer_mapper; + } + PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index, giraffe_mapper); cout << "chrom\ttplfeat\ttplpos\tlpseq\trpseq\tlppostpl\trppostmp\tlpposchrom\trpposchrom\t" << "lpnid\trpnid\tlplen\trplen\tlinsize\tminsize\tmaxsize\tvarlevel" << endl; diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index db7d5620686..2fba11944cf 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -234,6 +234,121 @@ using namespace std; }; + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); + + REQUIRE(primer_pairs.size() == left_primers_sequences.size()); + for (size_t i = 0; i < primer_pairs.size(); ++i) { + REQUIRE(left_primers_nodes_count[i] == primer_pairs[i].left_primer.mapped_nodes_ids.size()); + REQUIRE(left_primers_sequences[i] == primer_pairs[i].left_primer.sequence); + REQUIRE(left_primers_positions[i] == primer_pairs[i].left_primer.position_chromosome); + REQUIRE(left_primers_lengths[i] 
== primer_pairs[i].left_primer.length); + REQUIRE(right_primers_nodes_count[i] == primer_pairs[i].right_primer.mapped_nodes_ids.size()); + REQUIRE(right_primers_sequences[i] == primer_pairs[i].right_primer.sequence); + REQUIRE(right_primers_positions[i] == primer_pairs[i].right_primer.position_chromosome); + REQUIRE(right_primers_lengths[i] == primer_pairs[i].right_primer.length); + REQUIRE(linear_product_sizes[i] == primer_pairs[i].linear_product_size); + REQUIRE(min_product_sizes[i] == primer_pairs[i].min_product_size); + REQUIRE(max_product_sizes[i] == primer_pairs[i].max_product_size); + REQUIRE(abs(variation_level[i] - primer_pairs[i].variation_level) <= 0.0001); + } + + SECTION("Check that primers are assigned with correct nodes") { + vector pair_0_left_primer_nodes {27, 28}; + for (size_t i = 0; i < primer_pairs[0].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].left_primer.mapped_nodes_ids[i] == pair_0_left_primer_nodes[i]); + } + + vector pair_0_right_primer_nodes {33, 34}; + for (size_t i = 0; i < primer_pairs[0].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[0].right_primer.mapped_nodes_ids[i] == pair_0_right_primer_nodes[i]); + } + + vector pair_5_left_primer_nodes {9, 11, 12, 14, 15, 17}; + for (size_t i = 0; i < primer_pairs[5].left_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].left_primer.mapped_nodes_ids[i] == pair_5_left_primer_nodes[i]); + } + + vector pair_5_right_primer_nodes {22, 24, 25}; + for (size_t i = 0; i < primer_pairs[5].right_primer.mapped_nodes_ids.size(); i++) { + REQUIRE(primer_pairs[5].right_primer.mapped_nodes_ids[i] == pair_5_right_primer_nodes[i]); + } + } + + } + } + SECTION("template_position=11, no path name") { + string primers_path = "test/primers/y.primer3_with_ref_pos_11.nopath.out"; + ifstream file_handle(primers_path); + unique_ptr minimizer_index = vg::io::VPKG::load_one("test/primers/y.min"); + MinimizerMapper giraffe_mapper(gbwt_graph, *minimizer_index, 
&distance_index); + PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index, &giraffe_mapper); + + SECTION("Loads the correct number of chromosomes") { + REQUIRE(primer_finder.total_reference_paths() == 1); + } + + SECTION("Loads the correct number of primer pairs") { + REQUIRE(primer_finder.get_primer_pairs_of_chrom("y").size() == 5); + } + + SECTION("Loads and processes the primers correctly") { + primer_finder.add_primer_pair("y", 9, 14, 20, 22, 0, 20); // made up data, variation both at primers and in product + primer_finder.add_primer_pair("y", 31, 0, 15, 34, 1, 15); // made up data, no variation at primers or in product + + // Correct primer attributes + const vector left_primers_sequences { + "TGCCTGGCATAGAGGAAAGC", "GAGTCGAGGCTCAAGGACAG", "CAGAGTCGAGGCTCAAGGAC", + "GAGGCTCAAGGACAGCTCTC", "TCCAGAAGCTGCTCTTTCCC", "AGCCAGACAAATCTGGGTTC", + "CAACTGGTAGTTACT" + }; + + const vector left_primers_positions { + 362, 620, 618, 625, 819, 181, 388 + }; + + const vector left_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector left_primers_nodes_count { + 2, 1, 1, 2, 2, 6, 1 + }; + + const vector right_primers_sequences { + "GCCAGAAGAGCCTCAAGGAG", "AGGAGAGCTGGGAAAAGGGA", "AGGAGAGCTGGGAAAAGGGA", + "AGGAGAGCTGGGAAAAGGGA", "GCCTGGGTAGCTTTGGATGT", "AGATAATTAAACTGAAGTTC", + "GTTGACAATGAAAAG" + }; + + const vector right_primers_positions { + 466, 745, 745, 745, 935, 260, 485 + }; + + const vector right_primers_lengths { + 20, 20, 20, 20, 20, 20, 15 + }; + + const vector right_primers_nodes_count { + 2, 1, 1, 1, 2, 3, 1 + }; + + const vector min_product_sizes { + 124, 142, 144, 137, 136, 99, 112 + }; + + const vector max_product_sizes { + 124, 145, 147, 140, 137, 99, 112 + }; + + const vector linear_product_sizes { + 124, 145, 147, 140, 136, 99, 112 + }; + + const vector variation_level { + 1.0, 1.0, 1.0, 1.0, 1.0, 0.33333, 1.0 + }; + + const vector& primer_pairs = primer_finder.get_primer_pairs_of_chrom("y"); 
REQUIRE(primer_pairs.size() == left_primers_sequences.size()); @@ -278,4 +393,4 @@ using namespace std; } } } -} \ No newline at end of file +} diff --git a/test/primers/y.min b/test/primers/y.min new file mode 100644 index 0000000000000000000000000000000000000000..efe06d3137f9043739220fe605ca8b62842d6a6a GIT binary patch literal 32856 zcmeHQ4Nz3a9p8^b4mdvV01cl345uN2krX3DFKEU~(5PqzI}O2UJ5AeZJGA1U1~@d% z2>5|UGo~F4#E*eQV;ChVH8R11BSxc#!eCqZP-B7`P!uYDpxxc4vu{XW9^CVJ2j0!R zyZ89ByZ`;|fB*a6|NeL1yovKBx&Z(}$VERM!Vft6;P>H=0Sk8z|NijT^za`v!p(i* zV|n=Jfxjc2KG|ob&+?b55pIO6>P6)HdwRQ&0H}u=?L=)y{$($$Gchut^nLVpHMuZC zBOM=S{TKoxwtrl3TeD+)-Y#I!AS%^NzK!&@kjOtqK(76_op0{VKDQW!0^WaS)7$Oj z3nM6(-qGoOTkmQ5a{Hj0MExQ=Z?;^T*A85*y#8YSAy+?N?<4b*UeE*gc#6J&KhrNW z$pgi|{ts&QGAltrj{41r88w)N} z>q~J^b;Y_p6kP$|Ad2sKat)$?IC}l$`1__dHn+XS z69<&j+W)xz>6W6+v4;;Kzq_2y#-Ei0q{o1db@tT!udE4Zw2?N7z~67^Z4lF6ipRfu zM=$TfIr+#{N0pk8KO3JvX5+Ks{N?ahN^ea(fm9R7N!e9Kc-wXA-&vQC6fX*1;jEzkc^+DHAC%ZWQ`rvQzO_9?i~ z8PgHH7Wqlof7Q$J(_?Q(0v9ubhugnpux{5t6j z0ZIJE=1=eII`=FJiQ@f-jqeY}ui^#m#Z{&Q*mWzHPc8k|Z^$)Y{rN{We6`1O+MB?o zk(bZ(e>`D==x6uUUA-RnHzTcffc|+*_Ct&5v(}oY{gJ0c|1Hl#cV66^1oYPb&%Y_` zkET2IRbY&SewCA7IylbW9|K)&93Sx~iF>|oI1551bN-lqf15lHV*HyPSUJNwwI3rN zE0NoTEJQy*d7RD#W|3Aq;IsKpDPESc|c2NfQmcXYRTrM=ODng?Va(tP8 zsE>~Cj_)6v*-SdH$qd0FF8@@ufxnaMSN8WG$;r39>BhWWHOICmB8`pyReSTVwEUwUMJtH?`>t~N>08pDPJFarjD&wStLVAZhH+*X8pq3L<98zKf5U$L zW=GOT$+~na(o2jV<1U`Oj!FKYGD3u?j>0?~KT)emW3iP&d{yOdp^DAW${wT;s z`MD%G@UYJ?_1{^FDv5i|4|t7G`RTmbIp^y1H0)B(^#dE9ClmdIk=^~*Za;iqb-gAM z=p^jp{c9S#GP3>9n7N#P>UswmOuvoz^$($Yw?GdY`?s8aNm^S!Y02pr>@Hz{%cWPF z-r2lu!TqKjG^&EPKa<}R5$z{|$@^`Eb?6>0!Fc>nZzX#WNPS^JaOAK}vWhhX2`k+km;YhP?Y z)nUMMnA#so26QGX)n#}WO^PgXk{3pr#&p)h~{K@$=ci<;6e@*?;((^ipD)hY zojrCh3S7YPf0XvwXRVp$!h2e%k?^m|@#my71SI|cj*d_AQWn97f+hN=z4@P=e77*| zC=1o)4s@*^%LIBJ;czqXt$p}7gk(6x%|r=j?#S7&5od7kLT`aa&i z;`kfal(>pn#ph9whUd@d+gbZ79^d5T+q#6BkCoV6CHryG)}M~gns&8&U+B(k=-STl zVe+3v>VH~({FSR;*R5yYTXke2@~$6{zk%U9;^+Tz_^3^PO>iJ#Hwpg{o1LCAW1U*^ 
zH)T)l^@>CsqLTkZZTyCN{`EVz94kHL1~g{|^uJ*;zV#ko9J~B}KGJD9|17?XK0b!{ zNu}*iVlV&Huq+#EjYQv(51rqM<1gow`%Hh^>PP(loGxB}-_Sa?l(Zo>4Ab^gil5^2 zO*zTAv&`6IDaZE%dOOJYH5wdpJF*TymF6$ZJDQvezy^UoDkI{LT>VSpPvy033x%Fm zUO#3ZJ=h;6m;Ono7q30`M$)`q>?#qz+xHm(Hj3Ez|>;p%~M=5_>O|u(`j}^HB>QoiEk|6?92ELEG5u-6B?d@WT458^|c7Vq4=xVZ#r zzU6dr`@;U|>-!?x5Q^veV;k_@BBp8WcwTapXhik8J)z z*Kq7F4%NN}`!uF?=_5XysnGI2OB_KdzeIz2a)Npi05(tau_=wL3AlpyQ zqe~}%W-F(Q?F)O?U)bFihp0?`4a)wka~*BqFRLnZY4nX!4D zg+?D3@Fvfn;r}51r66nH&f@Epwj%Kn{Q%OdbFw9)<*j#rF_u#w{;RrR4tC!=X;XfBl?q<(&JMa`70I#xJ$;Q<{I) zD24!p68-bGk?z0?lsw;7Y54`=s}ep8Uyoo9w;vwm`I1WcbEanqs5bwX!?$3^cItt` zo)Z2i;dp>HJW68zHm%w_y?F@`T>b~mi!P@2Bw$Y) z{VQ_$t4$YP4|y9u1-Se%|IQ%K|Cyd<`Q9znf_){%r=AvVQ&V0G99_ZlABumqz|3|S zf?((VLZO2W_*Pp!?43V*`|pH8p#PT3&!c=llD+GvwEW(iKP1d8BQJaS1J~bT|K>@- zfB@bq*#}DbSDIc}elh}`O6yN#?^foN1_b^0aD2u46Ww~TIix-g2TAx>4j-j-vOT>hX?_CqvD5f5_uvtFgdfybas9={zhUxz8Kw9uO>YUD iS~d;lVILdq6Q$)VO-~O$aZgXf8j1P6T>gSq0N{VC+;A=c literal 0 HcmV?d00001 diff --git a/test/primers/y.primer3_with_ref_pos_11.nopath.out b/test/primers/y.primer3_with_ref_pos_11.nopath.out new file mode 100644 index 00000000000..095274bc043 --- /dev/null +++ b/test/primers/y.primer3_with_ref_pos_11.nopath.out @@ -0,0 +1,134 @@ +SEQUENCE_ID=x|gene|feature|11 
+SEQUENCE_TEMPLATE=TGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTAAGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 +PRIMER_LEFT_EXPLAIN=considered 4531, GC content failed 41, low tm 3277, high tm 93, high hairpin stability 6, long poly-x seq 15, ok 1099 +PRIMER_RIGHT_EXPLAIN=considered 4530, GC content failed 40, low tm 3203, high tm 106, long poly-x seq 15, ok 1166 +PRIMER_PAIR_EXPLAIN=considered 106, unacceptable product size 101, ok 5 +PRIMER_LEFT_NUM_RETURNED=5 +PRIMER_RIGHT_NUM_RETURNED=5 +PRIMER_INTERNAL_NUM_RETURNED=0 +PRIMER_PAIR_NUM_RETURNED=5 +PRIMER_PAIR_0_PENALTY=0.214768 +PRIMER_LEFT_0_PENALTY=0.107017 +PRIMER_RIGHT_0_PENALTY=0.107752 +PRIMER_LEFT_0_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_0_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_0=351,20 +PRIMER_RIGHT_0=474,20 +PRIMER_LEFT_0_TM=60.107 +PRIMER_RIGHT_0_TM=60.108 +PRIMER_LEFT_0_GC_PERCENT=55.000 
+PRIMER_RIGHT_0_GC_PERCENT=60.000 +PRIMER_LEFT_0_SELF_ANY_TH=19.30 +PRIMER_RIGHT_0_SELF_ANY_TH=0.00 +PRIMER_LEFT_0_SELF_END_TH=0.00 +PRIMER_RIGHT_0_SELF_END_TH=0.00 +PRIMER_LEFT_0_HAIRPIN_TH=31.71 +PRIMER_RIGHT_0_HAIRPIN_TH=37.39 +PRIMER_LEFT_0_END_STABILITY=3.5100 +PRIMER_RIGHT_0_END_STABILITY=3.6900 +PRIMER_PAIR_0_COMPL_ANY_TH=6.57 +PRIMER_PAIR_0_COMPL_END_TH=4.13 +PRIMER_PAIR_0_PRODUCT_SIZE=124 +PRIMER_PAIR_0_PRODUCT_TM=81.8 +PRIMER_PAIR_1_PENALTY=0.351214 +PRIMER_LEFT_1_PENALTY=0.172352 +PRIMER_RIGHT_1_PENALTY=0.178861 +PRIMER_LEFT_1_SEQUENCE=GAGTCGAGGCTCAAGGACAG +PRIMER_RIGHT_1_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_1=609,20 +PRIMER_RIGHT_1=753,20 +PRIMER_LEFT_1_TM=59.828 +PRIMER_RIGHT_1_TM=60.179 +PRIMER_LEFT_1_GC_PERCENT=60.000 +PRIMER_RIGHT_1_GC_PERCENT=55.000 +PRIMER_LEFT_1_SELF_ANY_TH=15.95 +PRIMER_RIGHT_1_SELF_ANY_TH=0.00 +PRIMER_LEFT_1_SELF_END_TH=0.00 +PRIMER_RIGHT_1_SELF_END_TH=0.00 +PRIMER_LEFT_1_HAIRPIN_TH=35.67 +PRIMER_RIGHT_1_HAIRPIN_TH=0.00 +PRIMER_LEFT_1_END_STABILITY=3.5100 +PRIMER_RIGHT_1_END_STABILITY=4.2000 +PRIMER_PAIR_1_COMPL_ANY_TH=0.00 +PRIMER_PAIR_1_COMPL_END_TH=0.00 +PRIMER_PAIR_1_PRODUCT_SIZE=145 +PRIMER_PAIR_1_PRODUCT_TM=83.5 +PRIMER_PAIR_2_PENALTY=0.351214 +PRIMER_LEFT_2_PENALTY=0.172352 +PRIMER_RIGHT_2_PENALTY=0.178861 +PRIMER_LEFT_2_SEQUENCE=CAGAGTCGAGGCTCAAGGAC +PRIMER_RIGHT_2_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_2=607,20 +PRIMER_RIGHT_2=753,20 +PRIMER_LEFT_2_TM=59.828 +PRIMER_RIGHT_2_TM=60.179 +PRIMER_LEFT_2_GC_PERCENT=60.000 +PRIMER_RIGHT_2_GC_PERCENT=55.000 +PRIMER_LEFT_2_SELF_ANY_TH=15.95 +PRIMER_RIGHT_2_SELF_ANY_TH=0.00 +PRIMER_LEFT_2_SELF_END_TH=13.47 +PRIMER_RIGHT_2_SELF_END_TH=0.00 +PRIMER_LEFT_2_HAIRPIN_TH=37.94 +PRIMER_RIGHT_2_HAIRPIN_TH=0.00 +PRIMER_LEFT_2_END_STABILITY=3.8500 +PRIMER_RIGHT_2_END_STABILITY=4.2000 +PRIMER_PAIR_2_COMPL_ANY_TH=0.00 +PRIMER_PAIR_2_COMPL_END_TH=0.00 +PRIMER_PAIR_2_PRODUCT_SIZE=147 +PRIMER_PAIR_2_PRODUCT_TM=83.6 +PRIMER_PAIR_3_PENALTY=0.354392 +PRIMER_LEFT_3_PENALTY=0.175531 
+PRIMER_RIGHT_3_PENALTY=0.178861 +PRIMER_LEFT_3_SEQUENCE=GAGGCTCAAGGACAGCTCTC +PRIMER_RIGHT_3_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_3=614,20 +PRIMER_RIGHT_3=753,20 +PRIMER_LEFT_3_TM=59.824 +PRIMER_RIGHT_3_TM=60.179 +PRIMER_LEFT_3_GC_PERCENT=60.000 +PRIMER_RIGHT_3_GC_PERCENT=55.000 +PRIMER_LEFT_3_SELF_ANY_TH=10.25 +PRIMER_RIGHT_3_SELF_ANY_TH=0.00 +PRIMER_LEFT_3_SELF_END_TH=0.00 +PRIMER_RIGHT_3_SELF_END_TH=0.00 +PRIMER_LEFT_3_HAIRPIN_TH=37.05 +PRIMER_RIGHT_3_HAIRPIN_TH=0.00 +PRIMER_LEFT_3_END_STABILITY=3.2000 +PRIMER_RIGHT_3_END_STABILITY=4.2000 +PRIMER_PAIR_3_COMPL_ANY_TH=26.57 +PRIMER_PAIR_3_COMPL_END_TH=26.57 +PRIMER_PAIR_3_PRODUCT_SIZE=140 +PRIMER_PAIR_3_PRODUCT_TM=83.2 +PRIMER_PAIR_4_PENALTY=0.360353 +PRIMER_LEFT_4_PENALTY=0.326264 +PRIMER_RIGHT_4_PENALTY=0.034089 +PRIMER_LEFT_4_SEQUENCE=TCCAGAAGCTGCTCTTTCCC +PRIMER_RIGHT_4_SEQUENCE=GCCTGGGTAGCTTTGGATGT +PRIMER_LEFT_4=808,20 +PRIMER_RIGHT_4=943,20 +PRIMER_LEFT_4_TM=59.674 +PRIMER_RIGHT_4_TM=60.034 +PRIMER_LEFT_4_GC_PERCENT=55.000 +PRIMER_RIGHT_4_GC_PERCENT=55.000 +PRIMER_LEFT_4_SELF_ANY_TH=1.00 +PRIMER_RIGHT_4_SELF_ANY_TH=0.00 +PRIMER_LEFT_4_SELF_END_TH=0.00 +PRIMER_RIGHT_4_SELF_END_TH=0.00 +PRIMER_LEFT_4_HAIRPIN_TH=34.56 +PRIMER_RIGHT_4_HAIRPIN_TH=0.00 +PRIMER_LEFT_4_END_STABILITY=3.9700 +PRIMER_RIGHT_4_END_STABILITY=3.0600 +PRIMER_PAIR_4_COMPL_ANY_TH=13.72 +PRIMER_PAIR_4_COMPL_END_TH=10.53 +PRIMER_PAIR_4_PRODUCT_SIZE=136 +PRIMER_PAIR_4_PRODUCT_TM=83.6 += From a26be9f30fdb90888ce45ad160ed9881164434ba Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 6 Jan 2025 00:56:24 -0800 Subject: [PATCH 30/43] Check more ways of being wrong --- src/primer_filter.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 06de657a342..81698176d0c 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -90,12 +90,19 @@ void PrimerFinder::load_primers(ifstream& file_handle) { //This will be the same for all primer pairs 
up to the next "=" vector cur_fields = move(split(split(line,'=')[1], '|')); - chromosome_name = cur_fields[0]; - template_feature = cur_fields[1] + "|" + cur_fields[2]; - template_position = stoi(cur_fields[3]); - has_path = graph->has_path(chromosome_name); - if (!has_path) { - cerr << "warning: primer finder can't find a path named " << chromosome_name << " in the graph" << endl << "\tfalling back on mapping the template sequence" << endl; + if (cur_fields.size() == 4) { + //If the sequence id is correctly formatted + chromosome_name = cur_fields[0]; + template_feature = cur_fields[1] + "|" + cur_fields[2]; + template_position = stoi(cur_fields[3]); + has_path = graph->has_path(chromosome_name); + if (!has_path) { + cerr << "warning: primer finder can't find a path named " << chromosome_name << " in the graph" << endl << "\tfalling back on mapping the template sequence" << endl; + } + } else { + template_feature = line; + has_path = false; + cerr << "warning: primer finder " << line << " is not formatted with a path and offset" << endl << "\tfalling back on mapping the template sequence" << endl; } #ifdef DEBUG_PRIMER_FILTER cerr << "FIND PRIMERS FOR INPUT " << line << ": " << chromosome_name << ", " << template_feature << ", " << template_position << endl; @@ -270,12 +277,11 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons vector mapped = giraffe_mapper->map(aln); //If there wasn't an alignment, error - if (mapped.empty()) { + if (mapped.empty() && mapped.front().mapping_quality() == 0) { throw std::runtime_error("error: Primer filter could not map template sequence"); } - //Get the reference paths we want to align to //This is done automatically //TODO: These are empty but they could be command line arguments @@ -296,6 +302,9 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons if (ref_rev) { ref_offset -= seq.size(); } +#ifdef DEBUG_PRIMER_FILTER + cerr << "\tmapped sequence to " << ref_name << " at offset " << ref_offset << 
endl; +#endif return std::make_pair(ref_name, (size_t)ref_offset); } From c4e9e68d7ee1f2ee45ab11b585d44859601f6c6b Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 6 Jan 2025 01:12:31 -0800 Subject: [PATCH 31/43] Check that the graph has the path surject used --- src/primer_filter.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 81698176d0c..f0940231029 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -274,7 +274,7 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons aln.set_name("primer_template"); //Map the alignment - vector mapped = giraffe_mapper->map(aln); + vector mapped = giraffe_mapper->map_from_chains(aln); //If there wasn't an alignment, error if (mapped.empty() && mapped.front().mapping_quality() == 0) { @@ -302,6 +302,7 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons if (ref_rev) { ref_offset -= seq.size(); } + assert (graph->has_path(ref_name)); #ifdef DEBUG_PRIMER_FILTER cerr << "\tmapped sequence to " << ref_name << " at offset " << ref_offset << endl; #endif From 1c5fb832c7ff5a96cd01551529db1853e9a9e555 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 6 Jan 2025 01:35:59 -0800 Subject: [PATCH 32/43] Use zipcodes and long read code path (kind of) but I havne't merged the real long read branch yet --- src/primer_filter.cpp | 2 +- src/subcommand/primers_main.cpp | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index f0940231029..45837bc64b1 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -274,7 +274,7 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons aln.set_name("primer_template"); //Map the alignment - vector mapped = giraffe_mapper->map_from_chains(aln); + vector mapped = giraffe_mapper->map(aln); //If there wasn't an alignment, error if (mapped.empty() && mapped.front().mapping_quality() == 0) 
{ diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 5760124deb2..6c1e41e8059 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -21,6 +21,7 @@ void help_primers(char** argv) { << " -r, --r-index FILE use this r index (required)" << endl << " -g, --gbz FILE use this gbz file (required)" << endl << " -M, --minimizers FILE use this minimizer file for mapping the template sequence, if necessary" << endl + << " -Z, --zipcodes FILE use this zipcode file for mapping the template sequence, if necessary" << endl << " -v, --variation-threshold DOUBLE output primers that work for at least this percentage of haplotypes (default: 0.8)" << endl << " -l, --tolerance INT allow this much difference between minimum and maximum sizes compared to the linear product size (default: 10)" << endl << " -n, --minimum-size INT minimum product size allowed (has precedence over --tolerance)" << endl @@ -75,6 +76,7 @@ int main_primers(int argc, char** argv) { string ri_path; string gbz_path; string min_path; + string zip_path; bool zero_variation = false; bool all_primers = false; int tolerance = 10; @@ -94,6 +96,7 @@ int main_primers(int argc, char** argv) { {"ri-path", required_argument, 0, 'r'}, {"gbz-path", required_argument, 0, 'g'}, {"minimizers", required_argument, 0, 'M'}, + {"zipcodes", required_argument, 0, 'Z'}, {"variation-threshold", required_argument, 0, 'v'}, {"tolerance", required_argument, 0, 'l'}, {"minimum-size", required_argument, 0, 'n'}, @@ -103,7 +106,7 @@ int main_primers(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:r:g:M:v:l:n:m:a", long_options, &option_index); + c = getopt_long (argc, argv, "hx:s:r:g:M:Z:v:l:n:m:a", long_options, &option_index); // Detect the end of the options. 
if (c == -1) break; @@ -130,6 +133,10 @@ int main_primers(int argc, char** argv) { min_path = optarg; break; + case 'Z': + zip_path = optarg; + break; + case 'v': variation_threshold = parse(optarg); break; @@ -198,7 +205,17 @@ int main_primers(int argc, char** argv) { MinimizerMapper* giraffe_mapper = nullptr; if (!min_path.empty()) { unique_ptr minimizer_index = vg::io::VPKG::load_one(min_path); - MinimizerMapper minimizer_mapper (gbwt_graph, *minimizer_index, &distance_index); + ZipCodeCollection zipcodes; + if (!zip_path.empty()) { + ifstream zip_in (zip_path); + zipcodes.deserialize(zip_in); + zip_in.close(); + } + + MinimizerMapper minimizer_mapper (gbwt_graph, *minimizer_index, &distance_index, &zipcodes); + //Set parameters + //TODO: I'm not actually sure about this because the sequence is long but it should match a path exactly so it should get the whole alignment at gapless extension + minimizer_mapper.align_from_chains = true; giraffe_mapper = &minimizer_mapper; } PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index, giraffe_mapper); From 58f4529114fbe96d6f05e191806c825a64bd9891 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Mon, 6 Jan 2025 09:29:51 -0800 Subject: [PATCH 33/43] Use zipcodes in primer unit test --- src/unittest/primer_filter.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 2fba11944cf..9198991d7e7 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -280,7 +280,11 @@ using namespace std; string primers_path = "test/primers/y.primer3_with_ref_pos_11.nopath.out"; ifstream file_handle(primers_path); unique_ptr minimizer_index = vg::io::VPKG::load_one("test/primers/y.min"); - MinimizerMapper giraffe_mapper(gbwt_graph, *minimizer_index, &distance_index); + ZipCodeCollection oversized_zipcodes; + ifstream zip_in ("test/primers/y.zipcodes"); + 
oversized_zipcodes.deserialize(zip_in); + zip_in.close(); + MinimizerMapper giraffe_mapper(gbwt_graph, *minimizer_index, &distance_index, &oversized_zipcodes); PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index, &giraffe_mapper); SECTION("Loads the correct number of chromosomes") { From 662aa70533d914f84e3996d16425692fb7135c92 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Thu, 9 Jan 2025 02:25:18 -0800 Subject: [PATCH 34/43] add mor checks for bad alignments --- src/primer_filter.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/primer_filter.cpp b/src/primer_filter.cpp index 45837bc64b1..19bdc912731 100644 --- a/src/primer_filter.cpp +++ b/src/primer_filter.cpp @@ -277,8 +277,9 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons vector mapped = giraffe_mapper->map(aln); //If there wasn't an alignment, error - if (mapped.empty() && mapped.front().mapping_quality() == 0) { - throw std::runtime_error("error: Primer filter could not map template sequence"); + if (mapped.empty() || mapped.front().mapping_quality() < 30) { + cerr << "error: Primer filter could not map template sequence " << seq << endl; + return std::make_pair(ref_name, std::numeric_limits::max()); } @@ -312,6 +313,9 @@ std::pair PrimerFinder::get_graph_coordinates_from_sequence(cons void PrimerFinder::update_min_max_product_size(PrimerPair& primer_pair) { + if (primer_pair.chromosome_name.empty()) { + return; + } const auto& sequence_visits = primer_pair.sequence_visits; handle_t start_handle = gbwt_graph.get_handle(primer_pair.left_primer.mapped_nodes_ids.front()); @@ -343,6 +347,9 @@ void PrimerFinder::map_to_nodes(Primer& primer, const string& path_name) { #ifdef DEBUG_PRIMER_FILTER cerr << "Map to nodes for primer " << primer.sequence << endl; #endif + if (path_name.empty()) { + return; + } path_handle_t reference_path_handle = graph->get_path_handle(path_name); string primer_seq; if (primer.left) 
{ @@ -454,6 +461,9 @@ void PrimerFinder::update_variation(PrimerPair& primer_pair, const string& path_ #ifdef DEBUG_PRIMER_FILTER cerr << "Update variation" << endl; #endif + if (path_name.empty()) { + return; + } const vector& left_primer_node_ids = primer_pair.left_primer.mapped_nodes_ids; const vector& right_primer_node_ids = primer_pair.right_primer.mapped_nodes_ids; vector nodes_id; From e811a4c23ace12a7e2dadd4f3411e2438f379c27 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Sun, 19 Jan 2025 04:14:06 -0800 Subject: [PATCH 35/43] Rename snarl index as distance index to avoid confusion --- src/subcommand/primers_main.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 6c1e41e8059..ed29fd76da9 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -17,7 +17,7 @@ void help_primers(char** argv) { << endl << "options:" << endl << " -x, --xg-path FILE use this xg graph (required)" << endl - << " -s, --snarl-index FILE use this snarl index (required)" << endl + << " -d, --dist-index FILE use this distance index (required)" << endl << " -r, --r-index FILE use this r index (required)" << endl << " -g, --gbz FILE use this gbz file (required)" << endl << " -M, --minimizers FILE use this minimizer file for mapping the template sequence, if necessary" << endl @@ -72,7 +72,7 @@ int main_primers(int argc, char** argv) { } string xg_path; - string snarl_index_path; + string distance_index_path; string ri_path; string gbz_path; string min_path; @@ -92,7 +92,7 @@ int main_primers(int argc, char** argv) { { {"help", no_argument, 0, 'h'}, {"xg-path", required_argument, 0, 'x'}, - {"snarl-index", required_argument, 0, 's'}, + {"dist-index", required_argument, 0, 'd'}, {"ri-path", required_argument, 0, 'r'}, {"gbz-path", required_argument, 0, 'g'}, {"minimizers", required_argument, 0, 'M'}, @@ -117,8 +117,8 @@ int main_primers(int argc, char** 
argv) { xg_path = optarg; break; - case 's': - snarl_index_path = optarg; + case 'd': + distance_index_path = optarg; break; case 'r': @@ -173,8 +173,8 @@ int main_primers(int argc, char** argv) { exit(1); } - if (snarl_index_path.empty()) { - cerr << "error:[vg primers] snarl index file (-s) is required" << endl; + if (distance_index_path.empty()) { + cerr << "error:[vg primers] distance index file (-d) is required" << endl; exit(1); } @@ -199,7 +199,7 @@ int main_primers(int argc, char** argv) { load_gbz(gbwt_index, gbwt_graph, gbz_path); gbwt_graph.set_gbwt(gbwt_index); r_index.setGBWT(gbwt_index); - distance_index.deserialize(snarl_index_path); + distance_index.deserialize(distance_index_path); graph = vg::io::VPKG::load_one(xg_path); ifstream file_handle(primers_path); MinimizerMapper* giraffe_mapper = nullptr; From 645f5c7af4fbc3f517a94997bc445bec562bc35d Mon Sep 17 00:00:00 2001 From: Xian Date: Sun, 19 Jan 2025 22:28:43 +0100 Subject: [PATCH 36/43] Make minimizer index accessible to giraffe --- src/subcommand/primers_main.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index ed29fd76da9..47bfe3f0fd2 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -106,7 +106,7 @@ int main_primers(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:s:r:g:M:Z:v:l:n:m:a", long_options, &option_index); + c = getopt_long (argc, argv, "hx:d:r:g:M:Z:v:l:n:m:a", long_options, &option_index); // Detect the end of the options. 
if (c == -1) break; @@ -203,9 +203,10 @@ int main_primers(int argc, char** argv) { graph = vg::io::VPKG::load_one(xg_path); ifstream file_handle(primers_path); MinimizerMapper* giraffe_mapper = nullptr; + unique_ptr minimizer_index; + ZipCodeCollection zipcodes; if (!min_path.empty()) { - unique_ptr minimizer_index = vg::io::VPKG::load_one(min_path); - ZipCodeCollection zipcodes; + minimizer_index = vg::io::VPKG::load_one(min_path); if (!zip_path.empty()) { ifstream zip_in (zip_path); zipcodes.deserialize(zip_in); From 9f8cb5f1f8afa50a4d14b2db13fea2ddce7328fb Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 00:12:56 +0100 Subject: [PATCH 37/43] Make sure the minimizer mapper sticks around --- src/subcommand/primers_main.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/subcommand/primers_main.cpp b/src/subcommand/primers_main.cpp index 47bfe3f0fd2..4b33a952f51 100644 --- a/src/subcommand/primers_main.cpp +++ b/src/subcommand/primers_main.cpp @@ -212,8 +212,11 @@ int main_primers(int argc, char** argv) { zipcodes.deserialize(zip_in); zip_in.close(); } - - MinimizerMapper minimizer_mapper (gbwt_graph, *minimizer_index, &distance_index, &zipcodes); + } + //The minimizer mapper needs to be declared here to keep it around in memory + //So sometimes make it with empty indexes but only keep the pointer to it if we had minimizers + MinimizerMapper minimizer_mapper (gbwt_graph, *minimizer_index, &distance_index, &zipcodes); + if (!min_path.empty()) { //Set parameters //TODO: I'm not actually sure about this because the sequence is long but it should match a path exactly so it should get the whole alignment at gapless extension minimizer_mapper.align_from_chains = true; From ec78bdfa95bbe98086e05a5ea6151c297bd01d00 Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 09:41:35 +0100 Subject: [PATCH 38/43] Move wiki page to wiki --- doc/primer_filter.md | 64 -------------------------------------------- doc/wiki | 2 +- 2 files 
changed, 1 insertion(+), 65 deletions(-) delete mode 100644 doc/primer_filter.md diff --git a/doc/primer_filter.md b/doc/primer_filter.md deleted file mode 100644 index 110655ac01d..00000000000 --- a/doc/primer_filter.md +++ /dev/null @@ -1,64 +0,0 @@ - -`vg primers` can be used to filter PRC primers based on properties of the pangenome such as whether there are variations in the primers and the possible lengths of the PRC product. -`vg primers` takes pangenome indexes and the output of `primer3` as input and outputs a `.tsv` file of the input primers and properties from the pangenome. - -# Get primers with `primer3` - -The input to `vg primers` is the output of the command line version of [`primer3`](https://github.com/primer3-org/primer3). - -`primer3` requires a config file formatted like: -``` -SEQUENCE_ID=CHM13#0#chr17|BRCA1P1|exon_1|44026826 -SEQUENCE_TEMPLATE=CATGT... -PRIMER_NUM_RETURN=10 -PRIMER_TASK=generic -PRIMER_PICK_LEFT_PRIMER=1 -PRIMER_PICK_INTERNAL_OLIGO=0 -PRIMER_PICK_RIGHT_PRIMER=1 -PRIMER_OPT_SIZE=20 -PRIMER_MIN_SIZE=18 -PRIMER_MAX_SIZE=22 -PRIMER_PRODUCT_SIZE_RANGE=75-150 -PRIMER_EXPLAIN_FLAG=1 -= -``` - -The `SEQUENCE_ID` field must be formatted correctly for `vg primers` to find the correct location of the primers in the pangenome. -There are four fields in `SEQUENCE_ID` separated by `|`. -They are the reference path name in the graph, the name of the gene, the exon or intron, and the offset of the sequence in the path. -The names of the reference paths in the graph can be found with `vg paths -L -R`. -The `SEQUENCE_TEMPLATE` is the nucleotide sequence the primers are found from. 
- -# Filtering primers with `vg primers` - -`vg primers` requires the following indexes of the pangenome: - -- the *xg* index created with `vg index -x` -- the *distance*/*snarl* index created with `vg index -j` -- the [*r-index*](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) created with `vg gbwt -r` -- the [*gbz*](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) created with `vg gbwt` - -# Interpreting output of `vg primers` - -`vg primers` outputs a tsv file with the following fields for each primer: - -| field | definition | description | -| ---------- | ---------- | ----------- | -| chrom | chromosome | reference path name, the first field in `SEQUENCE_ID` | -| tplfeat | template feature | the second and third fields in `SEQUENCE_ID` | -| tplpos | template position | offset along the reference path, the fourth field in `SEQUENCE_ID` | -| lpseq | left primer sequence | the nucleotide sequence of the left primer | -| rpseq | right primer sequence | the nucleotide sequence of the right primer | -| lppostpl | left primer position template | the offset of the left primer in the template sequence | -| rppostmp | right primer position template | the offset of the right primer in the template sequence | -| lpposchrom | left primer position chromosome | the offset of the left primer in the reference path | -| rpposchrom | right primer position chromosome | the offset of the right primer in the reference path | -| pnid | left primer mapped node ids | the node ids that the left primer overlaps in the graph | -| rpnid | right primer mapped node ids | the node ids that the right primer overlaps in the graph | -| lplen | left primer length | the length in nucleotides of the left primer | -| rplen | right primer length | the length in nucleotides of the right primer | -| linsize | linear product size | the length of the product in the linear genome (including primer lengths) | -| minsize | minimum product size | the minimum length of the product according to the 
pangenome | -| maxsize | maximum product size | the maximum length of the product according to the pangenome | -| varlevel | variation level | a measure of variation in the primers (the number of unique haplotypes / the total number of haplotypes) | - diff --git a/doc/wiki b/doc/wiki index 0e573e62851..143d916e1f2 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit 0e573e6285197cbb2d0a119fc067b2c15e056783 +Subproject commit 143d916e1f286d3b7f7c5c327eb43dd13835cab7 From b6493ef6fa7a58fa3828789fab26a532a5d3ff76 Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 09:58:03 +0100 Subject: [PATCH 39/43] Update wiki --- doc/wiki | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/wiki b/doc/wiki index 143d916e1f2..17836a40ad7 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit 143d916e1f286d3b7f7c5c327eb43dd13835cab7 +Subproject commit 17836a40ad78b7d4d997cb711c32873201a2e673 From 1e911f9918a95a6900d331ee413777f76d6c458f Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 10:16:39 +0100 Subject: [PATCH 40/43] Fix integration test for vg primers --- test/t/{55_vg_primers.t => 56_vg_primers.t} | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) rename test/t/{55_vg_primers.t => 56_vg_primers.t} (69%) diff --git a/test/t/55_vg_primers.t b/test/t/56_vg_primers.t similarity index 69% rename from test/t/55_vg_primers.t rename to test/t/56_vg_primers.t index 717b8dbc0ec..d2925b960d8 100644 --- a/test/t/55_vg_primers.t +++ b/test/t/56_vg_primers.t @@ -14,24 +14,24 @@ vg convert -x y.vg > y.xg vg index -j y.dist y.vg -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 6 "Get the expected number of primer pairs" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -a | wc -l) 6 "Get the expected number of primer pairs using --all-primers tag" -is $(vg primers 
primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 3 "Get the expected number of primer pairs using --tolerance tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" -is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" - -is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 9 "Get the expected number of primer pairs" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -a | wc -l) 11 "Get the expected number of primer pairs using --all-primers tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 6 "Get the expected number of primer pairs using --tolerance tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" -is $(vg primers primers/y.split.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 7 "Get the expected number of primer pairs using --maximum-size tag" - -vg primers primers/y.primer3_with_ref_pos.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz> y.ref_pos_0.out -vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -s y.dist -r primers/y.ri -g primers/y.giraffe.gbz > y.ref_pos_11.out +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 6 "Get the expected number of primer pairs" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -a | wc -l) 6 "Get the expected number 
of primer pairs using --all-primers tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 3 "Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 4 "Get the expected number of primer pairs using --maximum-size tag" + +is $(vg primers primers/y.split.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz | wc -l) 9 "Get the expected number of primer pairs" +is $(vg primers primers/y.split.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -a | wc -l) 11 "Get the expected number of primer pairs using --all-primers tag" +is $(vg primers primers/y.split.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -l 2 | wc -l) 6 "Get the expected number of primer pairs using --tolerance tag" +is $(vg primers primers/y.split.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -n 137 | wc -l) 4 "Get the expected number of primer pairs using --minimum-size tag" +is $(vg primers primers/y.split.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz -m 140 | wc -l) 7 "Get the expected number of primer pairs using --maximum-size tag" + +vg primers primers/y.primer3_with_ref_pos.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz> y.ref_pos_0.out +vg primers primers/y.primer3_with_ref_pos_11.out -x y.xg -d y.dist -r primers/y.ri -g primers/y.giraffe.gbz > y.ref_pos_11.out diff -q <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_0.out) <(awk '{$2=$3=$6=$7=""; print $0}' y.ref_pos_11.out) > diff_0_11 is $(cat diff_0_11 | wc -l) 0 "These two output files should have identical primers except for their positions on template" # clean 
up rm diff_0_11 rm y.vg y.xg y.dist -rm y.ref_pos_0.out y.ref_pos_11.out \ No newline at end of file +rm y.ref_pos_0.out y.ref_pos_11.out From 4b8f33f5a7ba009ddd1271ebc0ce45f2603d0a54 Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 10:52:55 +0100 Subject: [PATCH 41/43] Fix primer test files --- test/primers/index.min | Bin 32856 -> 0 bytes test/primers/y.min | Bin 32856 -> 32856 bytes test/primers/y.zipcodes | Bin 0 -> 98 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/primers/index.min create mode 100644 test/primers/y.zipcodes diff --git a/test/primers/index.min b/test/primers/index.min deleted file mode 100644 index 67273dc2214ddba5ccbddc9b002c1e03d683315b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32856 zcmeGl3s6+o^?nvv*yX!?BzyxHP>En9MTN))%(w{}70sATLe#c1?WApnRvi4nqGcxh zgnZ1{v>J$?fvPb~Bq=pA!J?y4ql5E1Tml}RQv$F_de|1p?Q6cQKXR6zCI$W_Q zGX8^rr2ZK1_$v+n+4SeD^w%A+_5dR<)6d0}zHLmuPI&v{%ev^Swq_UXQ}N^Yrx^>j zMIS$kJn!>zHvi}ueX!E}F)f;x^SLPwO^)RHCz0NkleaOL{|3J3*o!yiAV)31hrFo$ z$>wK2ax)I>&*rU$h?KjL*dt^ZeNHg?Nb;vE{M`II3*vxt9>re3e>C&gwglak;4}=? z7S>l1=X3g)`-huyJU`jgxe{o-hK*nP^S?mwjPX`W?>nhTvx$!n4|?lH$A^6T<*Aw( z`Rb%~*zsH5{#e>a{l<>?-F0(;%0l}TT<(l&k0?T(BKBX+2K?xhyJ5h=FpR!YME>qs z-+q-lex>T`y3-SmFPjCmV*Y{9fT;a1)xN!)Q9zV_Q*x5~x-L#ff#H1ou=#yx{wi7C zR#L4$j2(Ay`P|KZO62op`QulLKbi2^9+$*dfI|cCpXvW%rvH)i8(qDc!EH#b9;SbO zBKx6P|4~crqu$6xWc*fSpnG!;CIFr3$Klrp|4x7JWHs;;(XUeUp_AjHLs8Jt!ub*J zi$8Fx{sIV`&EaGE{be@)N%}X*zE$&0^ZGGz=kmks)9=YM(f{*K>82es`2io1asm8K zh(+K(6xF%j_s#iOC&GUz`b2sb! 
zr093Ar?e%rU?p}Fp-%0R+iWnp|(ZvWPJK=*QZJG(`MCOmFS1$CH8}6V0 zE5%2o;8(gl%JH)`2b-=zHw*pOsh=92DG&8V{uav5RUv?f-N$MC&R9@I!fT$ub%M%I z=k3nL*OOOZhm%}Cu=$w=+H62SIQ{T__08HapcS!?4;40at<3a9eoHv~kI~y-F#RTJ zpDI26lx|-b-px>B;rNzLX#>v|C#)baQ#ea z_>^w{=*npsFMO4SyzJ|LWYGWbOKJKm-TvOB6;C!tYz3}X;%~~v7KiZqIN)lbfBpL# zY=s(tc0N9jP&Tb5*9g`}nf(zWZhr{&-91tJF1~I~J*o)?F5}exkSkxs?b^&+H&UIoWewW<#<<@86mp^BMAR(y2*7w-_oy_KM+3}S~<^cT_pxIphnE#VY zp5vwculFlw@4K|Z8F-5Luc@CIo1cx$K%Nh&{UOEwOC_83XHGeYe3x_jkEMO~QA@fZ ze?SdYBJouz`0VA30;2JMTc=ObI+nqQ0z}5Awc%fsbU%N^xwnw|f?%6aw~DP#ygo7F z$5i=|5EGm@_GCiGHclTV|Emf9V~~H@b!Yn9 zxhG~Kx0A!#AEW#aYwHhN@BcX=fP|ey;z!IoJ#WBTrQ~n!fx4TO@iR z$su>cjsd8U|H7i9Nm&5ADZm#%WP3W5Z~FKMko2!hW6x)|?hJx1CXOGokJgj=HV{9s zI{yU#ME$3dqSG%VEbYaPBKbScJAYodJi{5m*_^(kiBH%%{spSAwheo}gK!nckNN); z$rJ!ccr%d57}6+YAJ{s5!4Sdk;mMJu<1Cfh0Z47*d-&pz&a^bUfeN+*#%Y9l0_7=&n zS{qWQy&IZJ@~=7ln0+(C{Ph_ty5~18?uS4`e@WqgTC>?${Z0aQ{S)usJU_H?&7N&* zno5!CTV8I?U)Y~q+!xl0P%PIUOg~PL{FS!gF1kenPZFU|^t#%*yo6=g%SP8J9P8ADbUAC@XO^d%nr`Zs(%s$^l|)| ze?AmHmfOCy>r>;NIFJZ@&E@MHMV0!GWc@$xetLcQu~Oh8vOW@;TITh|`~(mj%jv`H z*Nx16#bo=*C3N*HQ0?I5=JtiP@fUWt#UQNE{x)mlC%6AfVS^>~u`qrc6w$IZFZ>{M zisJIc?8Bk>*NNQN!Sd}G`K9x6bNyC4^D{%|z*-!dl>EFpjE6UUFO z?-Y~u$`PCoraFJW^i|hLHf!i`_C`}*ER(S=>W!AAEdC`})G`-}pj^S@mBq&hxvrYt%Xx_WW>8%=(P-2UzLdllF({>S zK-~@}iP*2t-&{yZD0)8gI)Q<}L4<$#`!nehB1HD}TR{May-V@W*2{C9XJ(KHA=KLA zgTpv0JlZ<{l!ov2uG?Pj34elNbdc6d9j)Bk2x2U(xez<@TQ!k{kDaNHKyv zoImWSmzM}X$KZTPx$xQRGYTl?|4Y#~Z}%=5fx<2#@h9PUfL1(7#D7byaZBsp837)t z}q@i!^_PtXI>(iUNOgTrq=K3dmrX=v4VBeg>Kk^GPKUb7=1GV%T&ru~zt;p_x| z>}93(S1I`@F|vOc1DY5<|FQM&q5R^DTM`oo`i*F^NV!z|QsrIkr3YS}TZ>&R%#V<2 z|6wop-nnJDI?e>$WBB-E@r9xNV~3lf?=O4qID{7N4`J3%c^~u;v45@v(|-U+2dY(E zzD5&YP?~?YoOd{SQuZCYG3Jc;qM{uVzSkq3BZ5j zwg#kM3U1|BScKht# ze`n+a-M3tR>L?9Xlgs}9k)ZeX4{=M%{|3Ns1;L=>*F3&?PB0*Vn?m-1wdF%teu@NI zh1Q?Q-mUbx^$2<&;Pf@$pXk(!4S^?Pu)j!rRf;~#g#bT<3wis@zI#aiKTC=}a?8o~ z^n!$CanRjL^T(1SC+HD=P*=_M7n}c15q{hE$4H*BG`n3N;9&P>}Fgz?MgB7y?cSbu;ovGxx_BS@zHV+=HYE%0ncP zRGeGJ!nQ&M}`eX&LhclFqEsuV&`s 
zf2X9g(Ya85BPG#Em&gRJ^teb*5F*q9TYV@;2&{=O8$P;SvwIu5T2hm5V)*|e1pGZj zWy2531M_c6cJkxwrt^fF6e2{Ce%mC#FW?tK0Q4uY{_q{2DcQX~TV0kR2JC~!BKRkK zmp|NJWa~f3KeFX-R_lgMJ^5PUHQMeW|L{HE{G+oyrET}weiJ$Go){7YDsPp@V17vB z#il>y&5tG)-mMYJj$-&-6~_R6>%)%?-(bs=wZ*C9+jItUfbI_+{}8IVBA>F6FKz_K?%#ua z0?Q{@e8!t5&r}>ct`?5D1?!0~-HPJO{5#nDXRAN^&WG0iSANJd}5 zTNPhE)YoTgfBdXfa9}>Vbeo;-6I$QDsd%wxky1%%vx>q9zU>nbf2Ikt;qmuz zw0&mov?Yb?!2GB01WJhC(Rf6<{)PMfpFaBOV)|zj`WOoqe|EtBOU4Neclm|7f3-r} z!G_Ce@tN!Rkt=r({c*6fRQx_oI~1S!QLsH+{0m2!^qJ{} z`68hm8$ZHme@csuIwsOf^;DSlkL&z|{`1>DjW3;hNtvlus9wYLyElCHR{{U{%|DQD zWXsq6cwnnVx_i3;vp-MpeBj`q%6a1Hk$h#C`cJ^`$mTc5ALQ>sAY$jYz<=OB@PF&{ z|6!NB_VCY*YPAFNuYQl(5iap1un&Rl1NnTe`v{#vaXltK4$aR+Up0tMSf(kd8H;cB z2~-UK?j9P+uU@%csA$K^VSUxR1_AmJSU&(Cz{iEan*Oy;tbBHEagK_;hS|@{QF_My zu<=JO&yg-GI~7SGI$vsW^^*r3%2|WRK&w38o*$Lg2u*DdbR~i~41ak34eYxI0XF@E za)iK|_6_g>d|U`bEI*uyxz0yZlcidoj-OoDZ!*o%(EI0vASq1G?+6y(*zR+N?3RX9 z@c`!E+}gbL^Ow7vmoj?FNX?5_`~TzlC-oC|+Uq~>r8Ftn^_P6-dj^Xi)JF*L)_(Wh z@6WDybDt?5&ad+N{CEcV&u{*Nd;`nZd@tdYLo!ck3b66z3HE;gzJc)td;lL80=)Gv zfZxDx;5XLB`u31M{qHo%8tdC^*WY)z`8Qcjvxxc@^M5{A|N0bL`|`Fc zA01h~QW`DA&i}f&{@=f>m^@WgJ9i@~z8~w~CdN0|`p;H=(tf|rs+K3kR$={v=Ucrh zFnJzHFo*a8!{@Kfi%+r1XQtZGZEu<^(QI)2m92fQmkXMbn;Nedni8C}$k6kDoA~?F uCEv$pUme>~5U0ZA&!hEM8ogmk9U}a=&M)XcLICDJ-Vwn+!*_l#cl!@UC`Vub literal 32856 zcmeHQ4Nz3a9p8^b4mdvV01cl345uN2krX3DFKEU~(5PqzI}O2UJ5AeZJGA1U1~@d% z2>5|UGo~F4#E*eQV;ChVH8R11BSxc#!eCqZP-B7`P!uYDpxxc4vu{XW9^CVJ2j0!R zyZ89ByZ`;|fB*a6|NeL1yovKBx&Z(}$VERM!Vft6;P>H=0Sk8z|NijT^za`v!p(i* zV|n=Jfxjc2KG|ob&+?b55pIO6>P6)HdwRQ&0H}u=?L=)y{$($$Gchut^nLVpHMuZC zBOM=S{TKoxwtrl3TeD+)-Y#I!AS%^NzK!&@kjOtqK(76_op0{VKDQW!0^WaS)7$Oj z3nM6(-qGoOTkmQ5a{Hj0MExQ=Z?;^T*A85*y#8YSAy+?N?<4b*UeE*gc#6J&KhrNW z$pgi|{ts&QGAltrj{41r88w)N} z>q~J^b;Y_p6kP$|Ad2sKat)$?IC}l$`1__dHn+XS z69<&j+W)xz>6W6+v4;;Kzq_2y#-Ei0q{o1db@tT!udE4Zw2?N7z~67^Z4lF6ipRfu zM=$TfIr+#{N0pk8KO3JvX5+Ks{N?ahN^ea(fm9R7N!e9Kc-wXA-&vQC6fX*1;jEzkc^+DHAC%ZWQ`rvQzO_9?i~ z8PgHH7Wqlof7Q$J(_?Q(0v9ubhugnpux{5t6j 
z0ZIJE=1=eII`=FJiQ@f-jqeY}ui^#m#Z{&Q*mWzHPc8k|Z^$)Y{rN{We6`1O+MB?o zk(bZ(e>`D==x6uUUA-RnHzTcffc|+*_Ct&5v(}oY{gJ0c|1Hl#cV66^1oYPb&%Y_` zkET2IRbY&SewCA7IylbW9|K)&93Sx~iF>|oI1551bN-lqf15lHV*HyPSUJNwwI3rN zE0NoTEJQy*d7RD#W|3Aq;IsKpDPESc|c2NfQmcXYRTrM=ODng?Va(tP8 zsE>~Cj_)6v*-SdH$qd0FF8@@ufxnaMSN8WG$;r39>BhWWHOICmB8`pyReSTVwEUwUMJtH?`>t~N>08pDPJFarjD&wStLVAZhH+*X8pq3L<98zKf5U$L zW=GOT$+~na(o2jV<1U`Oj!FKYGD3u?j>0?~KT)emW3iP&d{yOdp^DAW${wT;s z`MD%G@UYJ?_1{^FDv5i|4|t7G`RTmbIp^y1H0)B(^#dE9ClmdIk=^~*Za;iqb-gAM z=p^jp{c9S#GP3>9n7N#P>UswmOuvoz^$($Yw?GdY`?s8aNm^S!Y02pr>@Hz{%cWPF z-r2lu!TqKjG^&EPKa<}R5$z{|$@^`Eb?6>0!Fc>nZzX#WNPS^JaOAK}vWhhX2`k+km;YhP?Y z)nUMMnA#so26QGX)n#}WO^PgXk{3pr#&p)h~{K@$=ci<;6e@*?;((^ipD)hY zojrCh3S7YPf0XvwXRVp$!h2e%k?^m|@#my71SI|cj*d_AQWn97f+hN=z4@P=e77*| zC=1o)4s@*^%LIBJ;czqXt$p}7gk(6x%|r=j?#S7&5od7kLT`aa&i z;`kfal(>pn#ph9whUd@d+gbZ79^d5T+q#6BkCoV6CHryG)}M~gns&8&U+B(k=-STl zVe+3v>VH~({FSR;*R5yYTXke2@~$6{zk%U9;^+Tz_^3^PO>iJ#Hwpg{o1LCAW1U*^ zH)T)l^@>CsqLTkZZTyCN{`EVz94kHL1~g{|^uJ*;zV#ko9J~B}KGJD9|17?XK0b!{ zNu}*iVlV&Huq+#EjYQv(51rqM<1gow`%Hh^>PP(loGxB}-_Sa?l(Zo>4Ab^gil5^2 zO*zTAv&`6IDaZE%dOOJYH5wdpJF*TymF6$ZJDQvezy^UoDkI{LT>VSpPvy033x%Fm zUO#3ZJ=h;6m;Ono7q30`M$)`q>?#qz+xHm(Hj3Ez|>;p%~M=5_>O|u(`j}^HB>QoiEk|6?92ELEG5u-6B?d@WT458^|c7Vq4=xVZ#r zzU6dr`@;U|>-!?x5Q^veV;k_@BBp8WcwTapXhik8J)z z*Kq7F4%NN}`!uF?=_5XysnGI2OB_KdzeIz2a)Npi05(tau_=wL3AlpyQ zqe~}%W-F(Q?F)O?U)bFihp0?`4a)wka~*BqFRLnZY4nX!4D zg+?D3@Fvfn;r}51r66nH&f@Epwj%Kn{Q%OdbFw9)<*j#rF_u#w{;RrR4tC!=X;XfBl?q<(&JMa`70I#xJ$;Q<{I) zD24!p68-bGk?z0?lsw;7Y54`=s}ep8Uyoo9w;vwm`I1WcbEanqs5bwX!?$3^cItt` zo)Z2i;dp>HJW68zHm%w_y?F@`T>b~mi!P@2Bw$Y) z{VQ_$t4$YP4|y9u1-Se%|IQ%K|Cyd<`Q9znf_){%r=AvVQ&V0G99_ZlABumqz|3|S zf?((VLZO2W_*Pp!?43V*`|pH8p#PT3&!c=llD+GvwEW(iKP1d8BQJaS1J~bT|K>@- zfB@bq*#}DbSDIc}elh}`O6yN#?^foN1_b^0aD2u46Ww~TIix-g2TAx>4j-j-vOT>hX?_CqvD5f5_uvtFgdfybas9={zhUxz8Kw9uO>YUD iS~d;lVILdq6Q$)VO-~O$aZgXf8j1P6T>gSq0N{VC+;A=c diff --git a/test/primers/y.zipcodes b/test/primers/y.zipcodes new file mode 100644 index 
0000000000000000000000000000000000000000..fdcf9d09aa5c31cb10bf5e2f15988a157653fe1f GIT binary patch literal 98 zcmWFz@Qh++U|`^71QLwyOiT<6%uEbyKpqPtmoP|>;W{fb3&T!kAe)H=CMAGXJyt;g DCMg7g literal 0 HcmV?d00001 From a57c351580ddeafd005faae4608dc7af40123e0d Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 11:49:39 +0100 Subject: [PATCH 42/43] Fix format of test file --- test/primers/y.primer3.split.config | 24 ++ test/primers/y.split.out | 402 ++++++++++++++++++---------- 2 files changed, 292 insertions(+), 134 deletions(-) create mode 100644 test/primers/y.primer3.split.config diff --git a/test/primers/y.primer3.split.config b/test/primers/y.primer3.split.config new file mode 100644 index 00000000000..e34445c97b2 --- /dev/null +++ b/test/primers/y.primer3.split.config @@ -0,0 +1,24 @@ +SEQUENCE_ID=y,100 +SEQUENCE_TEMPLATE=TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 += +SEQUENCE_ID=y,601 +SEQUENCE_TEMPLATE=GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 
+PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 += diff --git a/test/primers/y.split.out b/test/primers/y.split.out index 54c5bdefed6..2cb765416ab 100644 --- a/test/primers/y.split.out +++ b/test/primers/y.split.out @@ -1,134 +1,268 @@ -PRIMER PICKING RESULTS FOR y|gene|feature|100 - -No mispriming library specified -Using 0-based sequence positions -OLIGO start len tm gc% any_th 3'_th hairpin seq -LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC -RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG -SEQUENCE SIZE: 401 -INCLUDED REGION SIZE: 401 - -PRODUCT SIZE: 124, PAIR ANY_TH COMPL: 6.57, PAIR 3'_TH COMPL: 4.13 - - 0 TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAA - - - 60 TGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACA - - - 120 TATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCT - - - 180 CTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGAT - - - 240 GCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTT - >>>>>>>>>>>>>>>>>>>> - - 300 ACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACC - - - 360 TCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA - <<<<<<<<<<<<<<<<<<<< - -KEYS (in order of precedence): ->>>>>> left primer -<<<<<< right primer - -ADDITIONAL OLIGOS - start len tm gc% any_th 3'_th hairpin seq - - 1 LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC - RIGHT PRIMER 387 20 59.67 55.00 0.00 0.00 38.53 AAGCCAGAAGAGCCTCAAGG - PRODUCT SIZE: 126, PAIR ANY_TH COMPL: 11.71, PAIR 3'_TH COMPL: 4.81 - - 2 LEFT PRIMER 262 20 60.11 55.00 19.30 0.00 31.71 TGCCTGGCATAGAGGAAAGC - RIGHT PRIMER 386 20 60.55 55.00 0.00 0.00 37.39 AGCCAGAAGAGCCTCAAGGA - PRODUCT SIZE: 125, PAIR ANY_TH COMPL: 11.71, PAIR 3'_TH COMPL: 0.00 - - 3 LEFT PRIMER 274 21 59.93 52.38 0.00 0.00 42.19 AGGAAAGCCTCTGACAACTGG - RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG - PRODUCT 
SIZE: 112, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 4 LEFT PRIMER 275 21 60.20 52.38 0.00 0.00 0.00 GGAAAGCCTCTGACAACTGGT - RIGHT PRIMER 385 20 60.11 60.00 0.00 0.00 37.39 GCCAGAAGAGCCTCAAGGAG - PRODUCT SIZE: 111, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 6.95 - -Statistics - con too in in not no tm tm high high high high - sid many tar excl ok bad GC too too any_th 3'_th hair- poly end - ered Ns get reg reg GC% clamp low high compl compl pin X stab ok -Left 1635 0 0 0 0 20 0 1345 6 0 0 19 0 0 245 -Right 1635 0 0 0 0 16 0 1303 8 0 0 3 0 0 305 -Pair Stats: -considered 61, unacceptable product size 55, ok 6 -libprimer3 release 2.6.1 - - -PRIMER PICKING RESULTS FOR y|gene|feature|601 - -No mispriming library specified -Using 0-based sequence positions -OLIGO start len tm gc% any_th 3'_th hairpin seq -LEFT PRIMER 19 20 59.83 60.00 15.95 0.00 35.67 GAGTCGAGGCTCAAGGACAG -RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA -SEQUENCE SIZE: 400 -INCLUDED REGION SIZE: 400 - -PRODUCT SIZE: 145, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 0 GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTG - >>>>>>>>>>>>>>>>>>>> - - 60 TATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTT - - - 120 TATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATC - <<<<<<<<<<<<<<<<<<<< - - 180 TCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTC - - - 240 AATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGC - - - 300 TTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTT - - - 360 AAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA - - -KEYS (in order of precedence): ->>>>>> left primer -<<<<<< right primer - -ADDITIONAL OLIGOS - start len tm gc% any_th 3'_th hairpin seq - - 1 LEFT PRIMER 17 20 59.83 60.00 15.95 13.47 37.94 CAGAGTCGAGGCTCAAGGAC - RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 147, PAIR ANY_TH COMPL: 0.00, PAIR 3'_TH COMPL: 0.00 - - 2 LEFT PRIMER 24 20 59.82 60.00 10.25 0.00 37.05 
GAGGCTCAAGGACAGCTCTC - RIGHT PRIMER 163 20 60.18 55.00 0.00 0.00 0.00 AGGAGAGCTGGGAAAAGGGA - PRODUCT SIZE: 140, PAIR ANY_TH COMPL: 26.57, PAIR 3'_TH COMPL: 26.57 - - 3 LEFT PRIMER 218 20 59.67 55.00 1.00 0.00 34.56 TCCAGAAGCTGCTCTTTCCC - RIGHT PRIMER 353 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT - PRODUCT SIZE: 136, PAIR ANY_TH COMPL: 13.72, PAIR 3'_TH COMPL: 10.53 - - 4 LEFT PRIMER 219 20 59.67 55.00 1.00 0.00 34.56 CCAGAAGCTGCTCTTTCCCT - RIGHT PRIMER 353 20 60.03 55.00 0.00 0.00 0.00 GCCTGGGTAGCTTTGGATGT - PRODUCT SIZE: 135, PAIR ANY_TH COMPL: 12.02, PAIR 3'_TH COMPL: 3.44 - -Statistics - con too in in not no tm tm high high high high - sid many tar excl ok bad GC too too any_th 3'_th hair- poly end - ered Ns get reg reg GC% clamp low high compl compl pin X stab ok -Left 1630 0 0 0 0 0 0 1013 75 0 0 1 0 0 541 -Right 1630 0 0 0 0 0 0 1066 37 0 0 0 0 0 527 -Pair Stats: -considered 23, unacceptable product size 18, ok 5 -libprimer3 release 2.6.1 - - +SEQUENCE_ID=y|gene|feature|100 +SEQUENCE_TEMPLATE=TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTTCAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTACTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTTACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAACA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 +PRIMER_LEFT_EXPLAIN=considered 1635, GC content failed 20, low tm 1345, high tm 6, high hairpin stability 19, ok 245 +PRIMER_RIGHT_EXPLAIN=considered 1635, GC content failed 16, low tm 1303, high tm 8, high hairpin stability 3, ok 305 +PRIMER_PAIR_EXPLAIN=considered 61, unacceptable product size 55, ok 6 +PRIMER_LEFT_NUM_RETURNED=5 
+PRIMER_RIGHT_NUM_RETURNED=5 +PRIMER_INTERNAL_NUM_RETURNED=0 +PRIMER_PAIR_NUM_RETURNED=5 +PRIMER_PAIR_0_PENALTY=0.214768 +PRIMER_LEFT_0_PENALTY=0.107017 +PRIMER_RIGHT_0_PENALTY=0.107752 +PRIMER_LEFT_0_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_0_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_0=262,20 +PRIMER_RIGHT_0=385,20 +PRIMER_LEFT_0_TM=60.107 +PRIMER_RIGHT_0_TM=60.108 +PRIMER_LEFT_0_GC_PERCENT=55.000 +PRIMER_RIGHT_0_GC_PERCENT=60.000 +PRIMER_LEFT_0_SELF_ANY_TH=19.30 +PRIMER_RIGHT_0_SELF_ANY_TH=0.00 +PRIMER_LEFT_0_SELF_END_TH=0.00 +PRIMER_RIGHT_0_SELF_END_TH=0.00 +PRIMER_LEFT_0_HAIRPIN_TH=31.71 +PRIMER_RIGHT_0_HAIRPIN_TH=37.39 +PRIMER_LEFT_0_END_STABILITY=3.5100 +PRIMER_RIGHT_0_END_STABILITY=3.6900 +PRIMER_PAIR_0_COMPL_ANY_TH=6.57 +PRIMER_PAIR_0_COMPL_END_TH=4.13 +PRIMER_PAIR_0_PRODUCT_SIZE=124 +PRIMER_PAIR_0_PRODUCT_TM=81.8 +PRIMER_PAIR_1_PENALTY=0.434608 +PRIMER_LEFT_1_PENALTY=0.107017 +PRIMER_RIGHT_1_PENALTY=0.327591 +PRIMER_LEFT_1_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_1_SEQUENCE=AAGCCAGAAGAGCCTCAAGG +PRIMER_LEFT_1=262,20 +PRIMER_RIGHT_1=387,20 +PRIMER_LEFT_1_TM=60.107 +PRIMER_RIGHT_1_TM=59.672 +PRIMER_LEFT_1_GC_PERCENT=55.000 +PRIMER_RIGHT_1_GC_PERCENT=55.000 +PRIMER_LEFT_1_SELF_ANY_TH=19.30 +PRIMER_RIGHT_1_SELF_ANY_TH=0.00 +PRIMER_LEFT_1_SELF_END_TH=0.00 +PRIMER_RIGHT_1_SELF_END_TH=0.00 +PRIMER_LEFT_1_HAIRPIN_TH=31.71 +PRIMER_RIGHT_1_HAIRPIN_TH=38.53 +PRIMER_LEFT_1_END_STABILITY=3.5100 +PRIMER_RIGHT_1_END_STABILITY=3.6100 +PRIMER_PAIR_1_COMPL_ANY_TH=11.71 +PRIMER_PAIR_1_COMPL_END_TH=4.81 +PRIMER_PAIR_1_PRODUCT_SIZE=126 +PRIMER_PAIR_1_PRODUCT_TM=81.6 +PRIMER_PAIR_2_PENALTY=0.653284 +PRIMER_LEFT_2_PENALTY=0.107017 +PRIMER_RIGHT_2_PENALTY=0.546267 +PRIMER_LEFT_2_SEQUENCE=TGCCTGGCATAGAGGAAAGC +PRIMER_RIGHT_2_SEQUENCE=AGCCAGAAGAGCCTCAAGGA +PRIMER_LEFT_2=262,20 +PRIMER_RIGHT_2=386,20 +PRIMER_LEFT_2_TM=60.107 +PRIMER_RIGHT_2_TM=60.546 +PRIMER_LEFT_2_GC_PERCENT=55.000 +PRIMER_RIGHT_2_GC_PERCENT=55.000 +PRIMER_LEFT_2_SELF_ANY_TH=19.30 
+PRIMER_RIGHT_2_SELF_ANY_TH=0.00 +PRIMER_LEFT_2_SELF_END_TH=0.00 +PRIMER_RIGHT_2_SELF_END_TH=0.00 +PRIMER_LEFT_2_HAIRPIN_TH=31.71 +PRIMER_RIGHT_2_HAIRPIN_TH=37.39 +PRIMER_LEFT_2_END_STABILITY=3.5100 +PRIMER_RIGHT_2_END_STABILITY=3.3600 +PRIMER_PAIR_2_COMPL_ANY_TH=11.71 +PRIMER_PAIR_2_COMPL_END_TH=0.00 +PRIMER_PAIR_2_PRODUCT_SIZE=125 +PRIMER_PAIR_2_PRODUCT_TM=81.7 +PRIMER_PAIR_3_PENALTY=1.180717 +PRIMER_LEFT_3_PENALTY=1.072965 +PRIMER_RIGHT_3_PENALTY=0.107752 +PRIMER_LEFT_3_SEQUENCE=AGGAAAGCCTCTGACAACTGG +PRIMER_RIGHT_3_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_3=274,21 +PRIMER_RIGHT_3=385,20 +PRIMER_LEFT_3_TM=59.927 +PRIMER_RIGHT_3_TM=60.108 +PRIMER_LEFT_3_GC_PERCENT=52.381 +PRIMER_RIGHT_3_GC_PERCENT=60.000 +PRIMER_LEFT_3_SELF_ANY_TH=0.00 +PRIMER_RIGHT_3_SELF_ANY_TH=0.00 +PRIMER_LEFT_3_SELF_END_TH=0.00 +PRIMER_RIGHT_3_SELF_END_TH=0.00 +PRIMER_LEFT_3_HAIRPIN_TH=42.19 +PRIMER_RIGHT_3_HAIRPIN_TH=37.39 +PRIMER_LEFT_3_END_STABILITY=4.0000 +PRIMER_RIGHT_3_END_STABILITY=3.6900 +PRIMER_PAIR_3_COMPL_ANY_TH=0.00 +PRIMER_PAIR_3_COMPL_END_TH=0.00 +PRIMER_PAIR_3_PRODUCT_SIZE=112 +PRIMER_PAIR_3_PRODUCT_TM=80.7 +PRIMER_PAIR_4_PENALTY=1.310570 +PRIMER_LEFT_4_PENALTY=1.202818 +PRIMER_RIGHT_4_PENALTY=0.107752 +PRIMER_LEFT_4_SEQUENCE=GGAAAGCCTCTGACAACTGGT +PRIMER_RIGHT_4_SEQUENCE=GCCAGAAGAGCCTCAAGGAG +PRIMER_LEFT_4=275,21 +PRIMER_RIGHT_4=385,20 +PRIMER_LEFT_4_TM=60.203 +PRIMER_RIGHT_4_TM=60.108 +PRIMER_LEFT_4_GC_PERCENT=52.381 +PRIMER_RIGHT_4_GC_PERCENT=60.000 +PRIMER_LEFT_4_SELF_ANY_TH=0.00 +PRIMER_RIGHT_4_SELF_ANY_TH=0.00 +PRIMER_LEFT_4_SELF_END_TH=0.00 +PRIMER_RIGHT_4_SELF_END_TH=0.00 +PRIMER_LEFT_4_HAIRPIN_TH=0.00 +PRIMER_RIGHT_4_HAIRPIN_TH=37.39 +PRIMER_LEFT_4_END_STABILITY=4.0000 +PRIMER_RIGHT_4_END_STABILITY=3.6900 +PRIMER_PAIR_4_COMPL_ANY_TH=0.00 +PRIMER_PAIR_4_COMPL_END_TH=6.95 +PRIMER_PAIR_4_PRODUCT_SIZE=111 +PRIMER_PAIR_4_PRODUCT_TM=80.8 += +SEQUENCE_ID=y|gene|feature|601 
+SEQUENCE_TEMPLATE=GACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAACAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATCTTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAGCTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATGA +PRIMER_TASK=generic +PRIMER_PICK_LEFT_PRIMER=1 +PRIMER_PICK_INTERNAL_OLIGO=0 +PRIMER_PICK_RIGHT_PRIMER=1 +PRIMER_OPT_SIZE=20 +PRIMER_MIN_SIZE=18 +PRIMER_MAX_SIZE=22 +PRIMER_PRODUCT_SIZE_RANGE=75-150 +PRIMER_EXPLAIN_FLAG=1 +PRIMER_LEFT_EXPLAIN=considered 1630, low tm 1013, high tm 75, high hairpin stability 1, ok 541 +PRIMER_RIGHT_EXPLAIN=considered 1630, low tm 1066, high tm 37, ok 527 +PRIMER_PAIR_EXPLAIN=considered 23, unacceptable product size 18, ok 5 +PRIMER_LEFT_NUM_RETURNED=5 +PRIMER_RIGHT_NUM_RETURNED=5 +PRIMER_INTERNAL_NUM_RETURNED=0 +PRIMER_PAIR_NUM_RETURNED=5 +PRIMER_PAIR_0_PENALTY=0.351214 +PRIMER_LEFT_0_PENALTY=0.172352 +PRIMER_RIGHT_0_PENALTY=0.178861 +PRIMER_LEFT_0_SEQUENCE=GAGTCGAGGCTCAAGGACAG +PRIMER_RIGHT_0_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_0=19,20 +PRIMER_RIGHT_0=163,20 +PRIMER_LEFT_0_TM=59.828 +PRIMER_RIGHT_0_TM=60.179 +PRIMER_LEFT_0_GC_PERCENT=60.000 +PRIMER_RIGHT_0_GC_PERCENT=55.000 +PRIMER_LEFT_0_SELF_ANY_TH=15.95 +PRIMER_RIGHT_0_SELF_ANY_TH=0.00 +PRIMER_LEFT_0_SELF_END_TH=0.00 +PRIMER_RIGHT_0_SELF_END_TH=0.00 +PRIMER_LEFT_0_HAIRPIN_TH=35.67 +PRIMER_RIGHT_0_HAIRPIN_TH=0.00 +PRIMER_LEFT_0_END_STABILITY=3.5100 +PRIMER_RIGHT_0_END_STABILITY=4.2000 +PRIMER_PAIR_0_COMPL_ANY_TH=0.00 +PRIMER_PAIR_0_COMPL_END_TH=0.00 +PRIMER_PAIR_0_PRODUCT_SIZE=145 +PRIMER_PAIR_0_PRODUCT_TM=83.5 +PRIMER_PAIR_1_PENALTY=0.351214 +PRIMER_LEFT_1_PENALTY=0.172352 +PRIMER_RIGHT_1_PENALTY=0.178861 +PRIMER_LEFT_1_SEQUENCE=CAGAGTCGAGGCTCAAGGAC +PRIMER_RIGHT_1_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_1=17,20 +PRIMER_RIGHT_1=163,20 
+PRIMER_LEFT_1_TM=59.828 +PRIMER_RIGHT_1_TM=60.179 +PRIMER_LEFT_1_GC_PERCENT=60.000 +PRIMER_RIGHT_1_GC_PERCENT=55.000 +PRIMER_LEFT_1_SELF_ANY_TH=15.95 +PRIMER_RIGHT_1_SELF_ANY_TH=0.00 +PRIMER_LEFT_1_SELF_END_TH=13.47 +PRIMER_RIGHT_1_SELF_END_TH=0.00 +PRIMER_LEFT_1_HAIRPIN_TH=37.94 +PRIMER_RIGHT_1_HAIRPIN_TH=0.00 +PRIMER_LEFT_1_END_STABILITY=3.8500 +PRIMER_RIGHT_1_END_STABILITY=4.2000 +PRIMER_PAIR_1_COMPL_ANY_TH=0.00 +PRIMER_PAIR_1_COMPL_END_TH=0.00 +PRIMER_PAIR_1_PRODUCT_SIZE=147 +PRIMER_PAIR_1_PRODUCT_TM=83.6 +PRIMER_PAIR_2_PENALTY=0.354392 +PRIMER_LEFT_2_PENALTY=0.175531 +PRIMER_RIGHT_2_PENALTY=0.178861 +PRIMER_LEFT_2_SEQUENCE=GAGGCTCAAGGACAGCTCTC +PRIMER_RIGHT_2_SEQUENCE=AGGAGAGCTGGGAAAAGGGA +PRIMER_LEFT_2=24,20 +PRIMER_RIGHT_2=163,20 +PRIMER_LEFT_2_TM=59.824 +PRIMER_RIGHT_2_TM=60.179 +PRIMER_LEFT_2_GC_PERCENT=60.000 +PRIMER_RIGHT_2_GC_PERCENT=55.000 +PRIMER_LEFT_2_SELF_ANY_TH=10.25 +PRIMER_RIGHT_2_SELF_ANY_TH=0.00 +PRIMER_LEFT_2_SELF_END_TH=0.00 +PRIMER_RIGHT_2_SELF_END_TH=0.00 +PRIMER_LEFT_2_HAIRPIN_TH=37.05 +PRIMER_RIGHT_2_HAIRPIN_TH=0.00 +PRIMER_LEFT_2_END_STABILITY=3.2000 +PRIMER_RIGHT_2_END_STABILITY=4.2000 +PRIMER_PAIR_2_COMPL_ANY_TH=26.57 +PRIMER_PAIR_2_COMPL_END_TH=26.57 +PRIMER_PAIR_2_PRODUCT_SIZE=140 +PRIMER_PAIR_2_PRODUCT_TM=83.2 +PRIMER_PAIR_3_PENALTY=0.360353 +PRIMER_LEFT_3_PENALTY=0.326264 +PRIMER_RIGHT_3_PENALTY=0.034089 +PRIMER_LEFT_3_SEQUENCE=TCCAGAAGCTGCTCTTTCCC +PRIMER_RIGHT_3_SEQUENCE=GCCTGGGTAGCTTTGGATGT +PRIMER_LEFT_3=218,20 +PRIMER_RIGHT_3=353,20 +PRIMER_LEFT_3_TM=59.674 +PRIMER_RIGHT_3_TM=60.034 +PRIMER_LEFT_3_GC_PERCENT=55.000 +PRIMER_RIGHT_3_GC_PERCENT=55.000 +PRIMER_LEFT_3_SELF_ANY_TH=1.00 +PRIMER_RIGHT_3_SELF_ANY_TH=0.00 +PRIMER_LEFT_3_SELF_END_TH=0.00 +PRIMER_RIGHT_3_SELF_END_TH=0.00 +PRIMER_LEFT_3_HAIRPIN_TH=34.56 +PRIMER_RIGHT_3_HAIRPIN_TH=0.00 +PRIMER_LEFT_3_END_STABILITY=3.9700 +PRIMER_RIGHT_3_END_STABILITY=3.0600 +PRIMER_PAIR_3_COMPL_ANY_TH=13.72 +PRIMER_PAIR_3_COMPL_END_TH=10.53 +PRIMER_PAIR_3_PRODUCT_SIZE=136 
+PRIMER_PAIR_3_PRODUCT_TM=83.6 +PRIMER_PAIR_4_PENALTY=0.361680 +PRIMER_LEFT_4_PENALTY=0.327591 +PRIMER_RIGHT_4_PENALTY=0.034089 +PRIMER_LEFT_4_SEQUENCE=CCAGAAGCTGCTCTTTCCCT +PRIMER_RIGHT_4_SEQUENCE=GCCTGGGTAGCTTTGGATGT +PRIMER_LEFT_4=219,20 +PRIMER_RIGHT_4=353,20 +PRIMER_LEFT_4_TM=59.672 +PRIMER_RIGHT_4_TM=60.034 +PRIMER_LEFT_4_GC_PERCENT=55.000 +PRIMER_RIGHT_4_GC_PERCENT=55.000 +PRIMER_LEFT_4_SELF_ANY_TH=1.00 +PRIMER_RIGHT_4_SELF_ANY_TH=0.00 +PRIMER_LEFT_4_SELF_END_TH=0.00 +PRIMER_RIGHT_4_SELF_END_TH=0.00 +PRIMER_LEFT_4_HAIRPIN_TH=34.56 +PRIMER_RIGHT_4_HAIRPIN_TH=0.00 +PRIMER_LEFT_4_END_STABILITY=4.2000 +PRIMER_RIGHT_4_END_STABILITY=3.0600 +PRIMER_PAIR_4_COMPL_ANY_TH=12.02 +PRIMER_PAIR_4_COMPL_END_TH=3.44 +PRIMER_PAIR_4_PRODUCT_SIZE=135 +PRIMER_PAIR_4_PRODUCT_TM=83.8 += From 8968c1488fefce55b51356073b3bf3826eb8cc4d Mon Sep 17 00:00:00 2001 From: Xian Date: Mon, 20 Jan 2025 12:39:15 +0100 Subject: [PATCH 43/43] Change unit test file names to assume it is being run from the test directory --- src/unittest/primer_filter.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/unittest/primer_filter.cpp b/src/unittest/primer_filter.cpp index 9198991d7e7..44a7aa28829 100644 --- a/src/unittest/primer_filter.cpp +++ b/src/unittest/primer_filter.cpp @@ -40,17 +40,17 @@ using namespace std; gbwtgraph::GBWTGraph gbwt_graph; gbwt::GBWT gbwt_index; gbwt::FastLocate r_index; - string snarl_index_path = "test/primers/y.dist"; - string xg_graph_path = "test/primers/y.xg"; + string snarl_index_path = "primers/y.dist"; + string xg_graph_path = "primers/y.xg"; distance_index.deserialize(snarl_index_path); graph = vg::io::VPKG::load_one(xg_graph_path); - load_r_index(r_index, "test/primers/y.ri"); - load_gbz(gbwt_index, gbwt_graph, "test/primers/y.giraffe.gbz"); + load_r_index(r_index, "primers/y.ri"); + load_gbz(gbwt_index, gbwt_graph, "primers/y.giraffe.gbz"); gbwt_graph.set_gbwt(gbwt_index); r_index.setGBWT(gbwt_index); 
SECTION("template_position=0") { - string primers_path = "test/primers/y.primer3_with_ref_pos.out"; + string primers_path = "primers/y.primer3_with_ref_pos.out"; ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); @@ -164,7 +164,7 @@ using namespace std; } SECTION("template_position=11") { - string primers_path = "test/primers/y.primer3_with_ref_pos_11.out"; + string primers_path = "primers/y.primer3_with_ref_pos_11.out"; ifstream file_handle(primers_path); PrimerFinder primer_finder(graph, &distance_index, file_handle, gbwt_graph, gbwt_index, r_index); @@ -277,11 +277,11 @@ using namespace std; } } SECTION("template_position=11, no path name") { - string primers_path = "test/primers/y.primer3_with_ref_pos_11.nopath.out"; + string primers_path = "primers/y.primer3_with_ref_pos_11.nopath.out"; ifstream file_handle(primers_path); - unique_ptr minimizer_index = vg::io::VPKG::load_one("test/primers/y.min"); + unique_ptr minimizer_index = vg::io::VPKG::load_one("primers/y.min"); ZipCodeCollection oversized_zipcodes; - ifstream zip_in ("test/primers/y.zipcodes"); + ifstream zip_in ("primers/y.zipcodes"); oversized_zipcodes.deserialize(zip_in); zip_in.close(); MinimizerMapper giraffe_mapper(gbwt_graph, *minimizer_index, &distance_index, &oversized_zipcodes);